2 # -*- coding: utf-8 -*-
4 from __future__ import absolute_import
15 import xml.etree.ElementTree
23 class InfoExtractor(object):
24 """Information Extractor class.
26 Information extractors are the classes that, given a URL, extract
27 information about the video (or videos) the URL refers to. This
28 information includes the real video URL, the video title, author and
29 others. The information is stored in a dictionary which is then
30 passed to the FileDownloader. The FileDownloader processes this
31 information possibly downloading the video to the file system, among
32 other possible outcomes.
34 The dictionaries must include the following fields:
38 title: Video title, unescaped.
39 ext: Video filename extension.
41 The following fields are optional:
43 format: The video format, defaults to ext (used for --get-format)
44 thumbnail: Full URL to a video thumbnail image.
45 description: One-line video description.
46 uploader: Full name of the video uploader.
47 upload_date: Video upload date (YYYYMMDD).
48 uploader_id: Nickname or id of the video uploader.
49 location: Physical location of the video.
50 player_url: SWF Player URL (used for rtmpdump).
51 subtitles: The subtitle file contents.
52 urlhandle: [internal] The urlHandle to be used to download the file,
53 like returned by urllib.request.urlopen
55 The fields should all be Unicode strings.
57 Subclasses of this one should re-define the _real_initialize() and
58 _real_extract() methods and define a _VALID_URL regexp.
59 Probably, they should also be added to the list of extractors.
61 _real_extract() must return a *list* of information dictionaries as
64 Finally, the _WORKING attribute should be set to False for broken IEs
65 in order to warn the users and skip the tests.
72 def __init__(self, downloader=None):
73 """Constructor. Receives an optional downloader."""
75 self.set_downloader(downloader)
78 def suitable(cls, url):
79 """Receives a URL and returns True if suitable for this IE."""
80 return re.match(cls._VALID_URL, url) is not None
84 """Getter method for _WORKING."""
88 """Initializes an instance (authentication, etc)."""
90 self._real_initialize()
93 def extract(self, url):
94 """Extracts URL information and returns it in list of dicts."""
96 return self._real_extract(url)
    def set_downloader(self, downloader):
        """Sets the downloader for this IE.

        The downloader is the FileDownloader instance whose params and
        reporting methods (to_screen, report_error, ...) this IE uses.
        May be None until one is attached.
        """
        self._downloader = downloader
102 def _real_initialize(self):
103 """Real initialization process. Redefine in subclasses."""
106 def _real_extract(self, url):
107 """Real extraction process. Redefine in subclasses."""
112 return type(self).__name__[:-2]
114 def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
115 """ Returns the response handle """
117 note = u'Downloading video webpage'
118 if note is not False:
119 self._downloader.to_screen(u'[%s] %s: %s' % (self.IE_NAME, video_id, note))
121 return compat_urllib_request.urlopen(url_or_request)
122 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
124 errnote = u'Unable to download webpage'
125 raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])
127 def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
128 """ Returns the data of the page as a string """
129 urlh = self._request_webpage(url_or_request, video_id, note, errnote)
130 content_type = urlh.headers.get('Content-Type', '')
131 m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
133 encoding = m.group(1)
136 webpage_bytes = urlh.read()
137 if self._downloader.params.get('dump_intermediate_pages', False):
139 url = url_or_request.get_full_url()
140 except AttributeError:
142 self._downloader.to_screen(u'Dumping request to ' + url)
143 dump = base64.b64encode(webpage_bytes).decode('ascii')
144 self._downloader.to_screen(dump)
145 return webpage_bytes.decode(encoding, 'replace')
147 #Methods for following #608
148 #They set the correct value of the '_type' key
149 def video_result(self, video_info):
150 """Returns a video"""
151 video_info['_type'] = 'video'
153 def url_result(self, url, ie=None):
154 """Returns a url that points to a page that should be processed"""
155 #TODO: ie should be the class used for getting the info
156 video_info = {'_type': 'url',
159 def playlist_result(self, entries, playlist_id=None, playlist_title=None):
160 """Returns a playlist"""
161 video_info = {'_type': 'playlist',
164 video_info['id'] = playlist_id
166 video_info['title'] = playlist_title
170 class YoutubeIE(InfoExtractor):
171 """Information extractor for youtube.com."""
175 (?:https?://)? # http(s):// (optional)
176 (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
177 tube\.majestyc\.net/) # the various hostnames, with wildcard subdomains
178 (?:.*?\#/)? # handle anchor (#/) redirect urls
179 (?: # the various things that can precede the ID:
180 (?:(?:v|embed|e)/) # v/ or embed/ or e/
181 |(?: # or the v= param in all its forms
182 (?:watch(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
183 (?:\?|\#!?) # the params delimiter ? or # or #!
184 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
187 )? # optional -> youtube.com/xxxx is OK
188 )? # all until now is optional -> you can pass the naked ID
189 ([0-9A-Za-z_-]+) # here is it! the YouTube video ID
190 (?(1).+)? # if we found the ID, everything can follow
192 _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
193 _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
194 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
195 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
196 _NETRC_MACHINE = 'youtube'
197 # Listed in order of quality
198 _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
199 _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
200 _video_extensions = {
206 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
212 _video_dimensions = {
231 def suitable(cls, url):
232 """Receives a URL and returns True if suitable for this IE."""
233 if YoutubePlaylistIE.suitable(url): return False
234 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
236 def report_lang(self):
237 """Report attempt to set language."""
238 self._downloader.to_screen(u'[youtube] Setting language')
240 def report_login(self):
241 """Report attempt to log in."""
242 self._downloader.to_screen(u'[youtube] Logging in')
244 def report_age_confirmation(self):
245 """Report attempt to confirm age."""
246 self._downloader.to_screen(u'[youtube] Confirming age')
248 def report_video_webpage_download(self, video_id):
249 """Report attempt to download video webpage."""
250 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
252 def report_video_info_webpage_download(self, video_id):
253 """Report attempt to download video info webpage."""
254 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
    def report_video_subtitles_download(self, video_id):
        """Report that we are checking which subtitles are available."""
        self._downloader.to_screen(u'[youtube] %s: Checking available subtitles' % video_id)
    def report_video_subtitles_request(self, video_id, sub_lang, format):
        """Report the download of the subtitles for one language/format."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))
264 def report_video_subtitles_available(self, video_id, sub_lang_list):
265 """Report available subtitles."""
266 sub_lang = ",".join(list(sub_lang_list.keys()))
267 self._downloader.to_screen(u'[youtube] %s: Available subtitles for video: %s' % (video_id, sub_lang))
269 def report_information_extraction(self, video_id):
270 """Report attempt to extract video information."""
271 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
    def report_unavailable_format(self, video_id, format):
        """Report that the requested format is not available."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
277 def report_rtmp_download(self):
278 """Indicate the download will use the RTMP protocol."""
279 self._downloader.to_screen(u'[youtube] RTMP download detected')
281 def _get_available_subtitles(self, video_id):
282 self.report_video_subtitles_download(video_id)
283 request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
285 sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
286 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
287 return (u'unable to download video subtitles: %s' % compat_str(err), None)
288 sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
289 sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
290 if not sub_lang_list:
291 return (u'video doesn\'t have subtitles', None)
294 def _list_available_subtitles(self, video_id):
295 sub_lang_list = self._get_available_subtitles(video_id)
296 self.report_video_subtitles_available(video_id, sub_lang_list)
298 def _request_subtitle(self, sub_lang, sub_name, video_id, format):
301 (error_message, sub_lang, sub)
303 self.report_video_subtitles_request(video_id, sub_lang, format)
304 params = compat_urllib_parse.urlencode({
310 url = 'http://www.youtube.com/api/timedtext?' + params
312 sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
313 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
314 return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
316 return (u'Did not fetch video subtitles', None, None)
317 return (None, sub_lang, sub)
319 def _extract_subtitle(self, video_id):
321 Return a list with a tuple:
322 [(error_message, sub_lang, sub)]
324 sub_lang_list = self._get_available_subtitles(video_id)
325 sub_format = self._downloader.params.get('subtitlesformat')
326 if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
327 return [(sub_lang_list[0], None, None)]
328 if self._downloader.params.get('subtitleslang', False):
329 sub_lang = self._downloader.params.get('subtitleslang')
330 elif 'en' in sub_lang_list:
333 sub_lang = list(sub_lang_list.keys())[0]
334 if not sub_lang in sub_lang_list:
335 return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]
337 subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
340 def _extract_all_subtitles(self, video_id):
341 sub_lang_list = self._get_available_subtitles(video_id)
342 sub_format = self._downloader.params.get('subtitlesformat')
343 if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
344 return [(sub_lang_list[0], None, None)]
346 for sub_lang in sub_lang_list:
347 subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
348 subtitles.append(subtitle)
351 def _print_formats(self, formats):
352 print('Available formats:')
354 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
356 def _real_initialize(self):
357 if self._downloader is None:
362 downloader_params = self._downloader.params
364 # Attempt to use provided username and password or .netrc data
365 if downloader_params.get('username', None) is not None:
366 username = downloader_params['username']
367 password = downloader_params['password']
368 elif downloader_params.get('usenetrc', False):
370 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
375 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
376 except (IOError, netrc.NetrcParseError) as err:
377 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
381 request = compat_urllib_request.Request(self._LANG_URL)
384 compat_urllib_request.urlopen(request).read()
385 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
386 self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
389 # No authentication to be performed
393 request = compat_urllib_request.Request(self._LOGIN_URL)
395 login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
396 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
397 self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
402 match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
404 galx = match.group(1)
406 match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
412 u'continue': u'http://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
416 u'PersistentCookie': u'yes',
418 u'bgresponse': u'js_disabled',
419 u'checkConnection': u'',
420 u'checkedDomains': u'youtube',
426 u'signIn': u'Sign in',
428 u'service': u'youtube',
432 # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
434 login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
435 login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
436 request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
439 login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
440 if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
441 self._downloader.report_warning(u'unable to log in: bad username or password')
443 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
444 self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
450 'action_confirm': 'Confirm',
452 request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
454 self.report_age_confirmation()
455 age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
456 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
457 self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
460 def _extract_id(self, url):
461 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
463 self._downloader.report_error(u'invalid URL: %s' % url)
465 video_id = mobj.group(2)
468 def _real_extract(self, url):
469 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
470 mobj = re.search(self._NEXT_URL_RE, url)
472 url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
473 video_id = self._extract_id(url)
476 self.report_video_webpage_download(video_id)
477 url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
478 request = compat_urllib_request.Request(url)
480 video_webpage_bytes = compat_urllib_request.urlopen(request).read()
481 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
482 self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
485 video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')
487 # Attempt to extract SWF player URL
488 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
490 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
495 self.report_video_info_webpage_download(video_id)
496 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
497 video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
498 % (video_id, el_type))
499 video_info_webpage = self._download_webpage(video_info_url, video_id,
501 errnote='unable to download video info webpage')
502 video_info = compat_parse_qs(video_info_webpage)
503 if 'token' in video_info:
505 if 'token' not in video_info:
506 if 'reason' in video_info:
507 self._downloader.report_error(u'YouTube said: %s' % video_info['reason'][0])
509 self._downloader.report_error(u'"token" parameter not in video info for unknown reason')
512 # Check for "rental" videos
513 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
514 self._downloader.report_error(u'"rental" videos not supported')
517 # Start extracting information
518 self.report_information_extraction(video_id)
521 if 'author' not in video_info:
522 self._downloader.report_error(u'unable to extract uploader name')
524 video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
527 video_uploader_id = None
528 mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
530 video_uploader_id = mobj.group(1)
532 self._downloader.report_warning(u'unable to extract uploader nickname')
535 if 'title' not in video_info:
536 self._downloader.report_error(u'unable to extract video title')
538 video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
541 if 'thumbnail_url' not in video_info:
542 self._downloader.report_warning(u'unable to extract video thumbnail')
544 else: # don't panic if we can't find it
545 video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
549 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
551 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
552 format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
553 for expression in format_expressions:
555 upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
560 video_description = get_element_by_id("eow-description", video_webpage)
561 if video_description:
562 video_description = clean_html(video_description)
564 video_description = ''
567 video_subtitles = None
569 if self._downloader.params.get('writesubtitles', False):
570 video_subtitles = self._extract_subtitle(video_id)
572 (sub_error, sub_lang, sub) = video_subtitles[0]
574 self._downloader.report_error(sub_error)
576 if self._downloader.params.get('allsubtitles', False):
577 video_subtitles = self._extract_all_subtitles(video_id)
578 for video_subtitle in video_subtitles:
579 (sub_error, sub_lang, sub) = video_subtitle
581 self._downloader.report_error(sub_error)
583 if self._downloader.params.get('listsubtitles', False):
584 sub_lang_list = self._list_available_subtitles(video_id)
587 if 'length_seconds' not in video_info:
588 self._downloader.report_warning(u'unable to extract video duration')
591 video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
594 video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])
596 # Decide which formats to download
597 req_format = self._downloader.params.get('format', None)
599 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
600 self.report_rtmp_download()
601 video_url_list = [(None, video_info['conn'][0])]
602 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
603 url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
604 url_data = [compat_parse_qs(uds) for uds in url_data_strs]
605 url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
606 url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)
608 format_limit = self._downloader.params.get('format_limit', None)
609 available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
610 if format_limit is not None and format_limit in available_formats:
611 format_list = available_formats[available_formats.index(format_limit):]
613 format_list = available_formats
614 existing_formats = [x for x in format_list if x in url_map]
615 if len(existing_formats) == 0:
616 self._downloader.report_error(u'no known formats available for video')
618 if self._downloader.params.get('listformats', None):
619 self._print_formats(existing_formats)
621 if req_format is None or req_format == 'best':
622 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
623 elif req_format == 'worst':
624 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
625 elif req_format in ('-1', 'all'):
626 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
628 # Specific formats. We pick the first in a slash-delimeted sequence.
629 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
630 req_formats = req_format.split('/')
631 video_url_list = None
632 for rf in req_formats:
634 video_url_list = [(rf, url_map[rf])]
636 if video_url_list is None:
637 self._downloader.report_error(u'requested format not available')
640 self._downloader.report_error(u'no conn or url_encoded_fmt_stream_map information found in video info')
644 for format_param, video_real_url in video_url_list:
646 video_extension = self._video_extensions.get(format_param, 'flv')
648 video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
649 self._video_dimensions.get(format_param, '???'))
653 'url': video_real_url,
654 'uploader': video_uploader,
655 'uploader_id': video_uploader_id,
656 'upload_date': upload_date,
657 'title': video_title,
658 'ext': video_extension,
659 'format': video_format,
660 'thumbnail': video_thumbnail,
661 'description': video_description,
662 'player_url': player_url,
663 'subtitles': video_subtitles,
664 'duration': video_duration
669 class MetacafeIE(InfoExtractor):
670 """Information Extractor for metacafe.com."""
672 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
673 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
674 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
675 IE_NAME = u'metacafe'
    def __init__(self, downloader=None):
        """Create the extractor, registering the optional downloader."""
        InfoExtractor.__init__(self, downloader)
680 def report_disclaimer(self):
681 """Report disclaimer retrieval."""
682 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
684 def report_age_confirmation(self):
685 """Report attempt to confirm age."""
686 self._downloader.to_screen(u'[metacafe] Confirming age')
688 def report_download_webpage(self, video_id):
689 """Report webpage download."""
690 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
    def report_extraction(self, video_id):
        """Report the start of information extraction for *video_id*."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
696 def _real_initialize(self):
697 # Retrieve disclaimer
698 request = compat_urllib_request.Request(self._DISCLAIMER)
700 self.report_disclaimer()
701 disclaimer = compat_urllib_request.urlopen(request).read()
702 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
703 self._downloader.report_error(u'unable to retrieve disclaimer: %s' % compat_str(err))
709 'submit': "Continue - I'm over 18",
711 request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
713 self.report_age_confirmation()
714 disclaimer = compat_urllib_request.urlopen(request).read()
715 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
716 self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
719 def _real_extract(self, url):
720 # Extract id and simplified title from URL
721 mobj = re.match(self._VALID_URL, url)
723 self._downloader.report_error(u'invalid URL: %s' % url)
726 video_id = mobj.group(1)
728 # Check if video comes from YouTube
729 mobj2 = re.match(r'^yt-(.*)$', video_id)
730 if mobj2 is not None:
731 return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1))]
733 # Retrieve video webpage to extract further information
734 request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
735 webpage = self._download_webpage(request, video_id)
737 # Extract URL, uploader and title from webpage
738 self.report_extraction(video_id)
739 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
741 mediaURL = compat_urllib_parse.unquote(mobj.group(1))
742 video_extension = mediaURL[-3:]
744 # Extract gdaKey if available
745 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
749 gdaKey = mobj.group(1)
750 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
752 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
754 self._downloader.report_error(u'unable to extract media URL')
756 vardict = compat_parse_qs(mobj.group(1))
757 if 'mediaData' not in vardict:
758 self._downloader.report_error(u'unable to extract media URL')
760 mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
762 self._downloader.report_error(u'unable to extract media URL')
764 mediaURL = mobj.group(1).replace('\\/', '/')
765 video_extension = mediaURL[-3:]
766 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
768 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
770 self._downloader.report_error(u'unable to extract title')
772 video_title = mobj.group(1).decode('utf-8')
774 mobj = re.search(r'submitter=(.*?);', webpage)
776 self._downloader.report_error(u'unable to extract uploader nickname')
778 video_uploader = mobj.group(1)
781 'id': video_id.decode('utf-8'),
782 'url': video_url.decode('utf-8'),
783 'uploader': video_uploader.decode('utf-8'),
785 'title': video_title,
786 'ext': video_extension.decode('utf-8'),
790 class DailymotionIE(InfoExtractor):
791 """Information Extractor for Dailymotion"""
793 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
794 IE_NAME = u'dailymotion'
    def __init__(self, downloader=None):
        """Create the extractor, registering the optional downloader."""
        InfoExtractor.__init__(self, downloader)
800 def report_extraction(self, video_id):
801 """Report information extraction."""
802 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
804 def _real_extract(self, url):
805 # Extract id and simplified title from URL
806 mobj = re.match(self._VALID_URL, url)
808 self._downloader.report_error(u'invalid URL: %s' % url)
811 video_id = mobj.group(1).split('_')[0].split('?')[0]
813 video_extension = 'mp4'
815 # Retrieve video webpage to extract further information
816 request = compat_urllib_request.Request(url)
817 request.add_header('Cookie', 'family_filter=off')
818 webpage = self._download_webpage(request, video_id)
820 # Extract URL, uploader and title from webpage
821 self.report_extraction(video_id)
822 mobj = re.search(r'\s*var flashvars = (.*)', webpage)
824 self._downloader.report_error(u'unable to extract media URL')
826 flashvars = compat_urllib_parse.unquote(mobj.group(1))
828 for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
831 self._downloader.to_screen(u'[dailymotion] Using %s' % key)
834 self._downloader.report_error(u'unable to extract video URL')
837 mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
839 self._downloader.report_error(u'unable to extract video URL')
842 video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')
844 # TODO: support choosing qualities
846 mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
848 self._downloader.report_error(u'unable to extract title')
850 video_title = unescapeHTML(mobj.group('title'))
852 video_uploader = None
853 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
855 # lookin for official user
856 mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
857 if mobj_official is None:
858 self._downloader.report_warning(u'unable to extract uploader nickname')
860 video_uploader = mobj_official.group(1)
862 video_uploader = mobj.group(1)
864 video_upload_date = None
865 mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
867 video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)
872 'uploader': video_uploader,
873 'upload_date': video_upload_date,
874 'title': video_title,
875 'ext': video_extension,
879 class PhotobucketIE(InfoExtractor):
880 """Information extractor for photobucket.com."""
882 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
883 IE_NAME = u'photobucket'
    def __init__(self, downloader=None):
        """Create the extractor, registering the optional downloader."""
        InfoExtractor.__init__(self, downloader)
888 def report_download_webpage(self, video_id):
889 """Report webpage download."""
890 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
    def report_extraction(self, video_id):
        """Report the start of information extraction for *video_id*."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
896 def _real_extract(self, url):
897 # Extract id from URL
898 mobj = re.match(self._VALID_URL, url)
900 self._downloader.report_error(u'Invalid URL: %s' % url)
903 video_id = mobj.group(1)
905 video_extension = 'flv'
907 # Retrieve video webpage to extract further information
908 request = compat_urllib_request.Request(url)
910 self.report_download_webpage(video_id)
911 webpage = compat_urllib_request.urlopen(request).read()
912 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
913 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
916 # Extract URL, uploader, and title from webpage
917 self.report_extraction(video_id)
918 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
920 self._downloader.report_error(u'unable to extract media URL')
922 mediaURL = compat_urllib_parse.unquote(mobj.group(1))
926 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
928 self._downloader.report_error(u'unable to extract title')
930 video_title = mobj.group(1).decode('utf-8')
932 video_uploader = mobj.group(2).decode('utf-8')
935 'id': video_id.decode('utf-8'),
936 'url': video_url.decode('utf-8'),
937 'uploader': video_uploader,
939 'title': video_title,
940 'ext': video_extension.decode('utf-8'),
944 class YahooIE(InfoExtractor):
945 """Information extractor for video.yahoo.com."""
948 # _VALID_URL matches all Yahoo! Video URLs
949 # _VPAGE_URL matches only the extractable '/watch/' URLs
950 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
951 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
952 IE_NAME = u'video.yahoo'
    def __init__(self, downloader=None):
        """Create the extractor, registering the optional downloader."""
        InfoExtractor.__init__(self, downloader)
957 def report_download_webpage(self, video_id):
958 """Report webpage download."""
959 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
    def report_extraction(self, video_id):
        """Report the start of information extraction for *video_id*."""
        self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
965 def _real_extract(self, url, new_video=True):
966 # Extract ID from URL
967 mobj = re.match(self._VALID_URL, url)
969 self._downloader.report_error(u'Invalid URL: %s' % url)
972 video_id = mobj.group(2)
973 video_extension = 'flv'
975 # Rewrite valid but non-extractable URLs as
976 # extractable English language /watch/ URLs
977 if re.match(self._VPAGE_URL, url) is None:
978 request = compat_urllib_request.Request(url)
980 webpage = compat_urllib_request.urlopen(request).read()
981 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
982 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
985 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
987 self._downloader.report_error(u'Unable to extract id field')
989 yahoo_id = mobj.group(1)
991 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
993 self._downloader.report_error(u'Unable to extract vid field')
995 yahoo_vid = mobj.group(1)
997 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
998 return self._real_extract(url, new_video=False)
1000 # Retrieve video webpage to extract further information
1001 request = compat_urllib_request.Request(url)
1003 self.report_download_webpage(video_id)
1004 webpage = compat_urllib_request.urlopen(request).read()
1005 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1006 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
1009 # Extract uploader and title from webpage
1010 self.report_extraction(video_id)
1011 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1013 self._downloader.report_error(u'unable to extract video title')
1015 video_title = mobj.group(1).decode('utf-8')
1017 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1019 self._downloader.report_error(u'unable to extract video uploader')
1021 video_uploader = mobj.group(1).decode('utf-8')
1023 # Extract video thumbnail
1024 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1026 self._downloader.report_error(u'unable to extract video thumbnail')
1028 video_thumbnail = mobj.group(1).decode('utf-8')
1030 # Extract video description
1031 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1033 self._downloader.report_error(u'unable to extract video description')
1035 video_description = mobj.group(1).decode('utf-8')
1036 if not video_description:
1037 video_description = 'No description available.'
1039 # Extract video height and width
1040 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1042 self._downloader.report_error(u'unable to extract video height')
1044 yv_video_height = mobj.group(1)
1046 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1048 self._downloader.report_error(u'unable to extract video width')
1050 yv_video_width = mobj.group(1)
1052 # Retrieve video playlist to extract media URL
1053 # I'm not completely sure what all these options are, but we
1054 # seem to need most of them, otherwise the server sends a 401.
1055 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
1056 yv_bitrate = '700' # according to Wikipedia this is hard-coded
1057 request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1058 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1059 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1061 self.report_download_webpage(video_id)
1062 webpage = compat_urllib_request.urlopen(request).read()
1063 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1064 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
1067 # Extract media URL from playlist XML
1068 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1070 self._downloader.report_error(u'Unable to extract media URL')
1072 video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1073 video_url = unescapeHTML(video_url)
1076 'id': video_id.decode('utf-8'),
1078 'uploader': video_uploader,
1079 'upload_date': None,
1080 'title': video_title,
1081 'ext': video_extension.decode('utf-8'),
1082 'thumbnail': video_thumbnail.decode('utf-8'),
1083 'description': video_description,
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
    IE_NAME = u'vimeo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract video info from a Vimeo URL.

        Reads the embedded player config JSON and picks the best
        quality/codec combination. Returns a single-element list of info
        dictionaries, or None on error.
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        if not mobj.group('proto'):
            url = 'https://' + url
        if mobj.group('direct_link'):
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except:
            self._downloader.report_error(u'unable to extract info section')
            return

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            video_description = u''

        # Extract upload date
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = {'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            self._downloader.report_error(u'no known codec found')
            return

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    %(video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
        }]
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)

    def fetch_webpage(self, url):
        """Download and return the raw page at url, or None on error."""
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch url, apply regex, and collect the groups listed in
        matchTuples as a dict.

        matchTuples is a list of (group_index, key, error_message); the
        matched group for each index is stored under key. Returns the
        dict, or None when the page does not match.
        """
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                self._downloader.trouble(err)
                return
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        """Extract the rtmp URL for a live stream page (side effects only)."""
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
                '(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path',   u'ERROR: could not extract video path: %s' % url),
                (2, 'player', u'ERROR: could not extract video player: %s' % url),
                (3, 'url',    u'ERROR: could not extract video url: %s' % url)
            ]
        )
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Follow the chain of videoref XML documents for an Arte+7 page
        and return its info dictionary."""
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id',    u'ERROR: could not extract video id: %s' % url),
                (2, 'title', u'ERROR: could not extract video title: %s' % url),
                (3, 'date',  u'ERROR: could not extract video date: %s' % url),
                (4, 'url',   u'ERROR: could not extract video url: %s' % url)
            ]
        )

        return {
            'id':           info.get('id'),
            'url':          compat_urllib_parse.unquote(info.get('url')),
            'uploader':     u'arte.tv',
            'upload_date':  info.get('date'),
            'title':        info.get('title').decode('utf-8'),
            'ext':          u'mp4',
            'format':       u'NA',
            'player_url':   None,
        }

    def _real_extract(self, url):
        """Dispatch to the live-stream or Arte+7 extractor based on the URL."""
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor."""

    _VALID_URL = r'.*'
    IE_NAME = u'generic'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        if not self._downloader.params.get('test', False):
            self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
        self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)

    def report_following_redirect(self, new_url):
        """Report information extraction."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case return the new url."""
        class HeadRequest(compat_urllib_request.Request):
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                fp.read()
                fp.close()

                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                                      headers=newheaders,
                                                                      origin_req_host=req.get_origin_req_host(),
                                                                      unverifiable=True))

        # Build our opener
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        if url == new_url:
            return False

        self.report_following_redirect(new_url)
        return new_url

    def _real_extract(self, url):
        """Last-resort extraction: scrape the page for an http media URL."""
        new_url = self._test_redirect(url)
        if new_url: return [self.url_result(new_url)]

        video_id = url.split('/')[-1]
        try:
            webpage = self._download_webpage(url, video_id)
        except ValueError as err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit
            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit: JWPlayer JS loader
            mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        video_title = mobj.group(1)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        video_uploader = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries."""
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the 'ytsearchN:terms' query and schedule the downloads."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]  # strip the leading 'ytsearch'
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_youtube_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_youtube_results:
                    self._downloader.report_warning(u'ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                    n = self._max_youtube_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        limit = n

        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download API page: %s' % compat_str(err))
                return
            api_response = json.loads(data)['data']

            if not 'items' in api_response:
                self._downloader.trouble(u'[youtube] No video results')
                return

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            # The API reports the total result count; never request past it.
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
        return
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries."""
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the 'gvsearchN:terms' query and schedule the downloads."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]  # strip the leading 'gvsearch'
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_google_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_google_results:
                    self._downloader.report_warning(u'gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
                    n = self._max_google_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in video_ids:
                    video_ids.append(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
                        return

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for id in video_ids:
                    self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
                return

            pagenum = pagenum + 1
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""

    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the 'yvsearchN:terms' query and schedule the downloads."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]  # strip the leading 'yvsearch'
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_yahoo_results:
                    self._downloader.report_warning(u'yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
                    n = self._max_yahoo_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        already_seen = set()
        pagenum = 1

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in already_seen:
                    video_ids.append(video_id)
                    already_seen.add(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                        return

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for id in video_ids:
                    self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                return

            pagenum = pagenum + 1
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    _VALID_URL = r"""(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
                        .*
                     |
                        ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
    _MAX_RESULTS = 50
    IE_NAME = u'youtube:playlist'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written with re.VERBOSE, so the default
        # InfoExtractor.suitable cannot be used here.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        """Page through the GData playlist feed and return a playlist result."""
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        # Download playlist videos from API
        playlist_id = mobj.group(1) or mobj.group(2)
        page_num = 1
        videos = []

        while True:
            self.report_download_page(playlist_id, page_num)

            url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
            try:
                page = compat_urllib_request.urlopen(url).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            try:
                response = json.loads(page)
            except ValueError as err:
                self._downloader.report_error(u'Invalid JSON in API response: ' + compat_str(err))
                return

            if 'feed' not in response:
                self._downloader.report_error(u'Got a malformed response from YouTube API')
                return
            if 'entry' not in response['feed']:
                # Number of videos is a multiple of self._MAX_RESULTS
                break

            # Keep the playlist position so the final ordering is correct.
            videos += [ (entry['yt$position']['$t'], entry['content']['src'])
                        for entry in response['feed']['entry']
                        if 'content' in entry ]

            if len(response['feed']['entry']) < self._MAX_RESULTS:
                break
            page_num += 1

        videos = [v[1] for v in sorted(videos)]

        url_results = [self.url_result(url) for url in videos]
        return [self.playlist_result(url_results, playlist_id)]
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def report_download_page(self, channel_id, pagenum):
        """Report attempt to download channel page with given number."""
        self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))

    def extract_videos_from_page(self, page):
        """Return the video ids linked from a channel page, in page order,
        without duplicates."""
        ids_in_page = []
        for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))
        return ids_in_page

    def _real_extract(self, url):
        """Collect all video ids of a channel and return a playlist result."""
        # Extract channel id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        # Download channel page
        channel_id = mobj.group(1)
        video_ids = []
        pagenum = 1

        self.report_download_page(channel_id, pagenum)
        url = self._TEMPLATE_URL % (channel_id, pagenum)
        request = compat_urllib_request.Request(url)
        try:
            page = compat_urllib_request.urlopen(request).read().decode('utf8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
            return

        # Extract video identifiers
        ids_in_page = self.extract_videos_from_page(page)
        video_ids.extend(ids_in_page)

        # Download any subsequent channel pages using the json-based channel_ajax query
        if self._MORE_PAGES_INDICATOR in page:
            while True:
                pagenum = pagenum + 1

                self.report_download_page(channel_id, pagenum)
                url = self._MORE_PAGES_URL % (pagenum, channel_id)
                request = compat_urllib_request.Request(url)
                try:
                    page = compat_urllib_request.urlopen(request).read().decode('utf8')
                except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                    self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                    return

                page = json.loads(page)

                ids_in_page = self.extract_videos_from_page(page['content_html'])
                video_ids.extend(ids_in_page)

                if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
                    break

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids]
        url_entries = [self.url_result(url) for url in urls]
        return [self.playlist_result(url_entries, channel_id)]
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                                   (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_extract(self, url):
        """Collect all upload ids of a user and return a playlist result."""
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 0

        while True:
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            self.report_download_page(username, start_index)

            request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._GDATA_PAGE_SIZE:
                break

            pagenum += 1

        urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
        url_results = [self.url_result(url) for url in urls]
        return [self.playlist_result(url_results, playlist_title = username)]
# --- BlipTVUserIE -----------------------------------------------------------
# Extracts all videos of a blip.tv user: resolves the user's numeric id from
# the profile page, then pages through the mobile AJAX episode-list endpoint
# collecting video slugs, and returns one playlist result of url_result items.
# NOTE(review): this listing is elided/line-numbered — `try:` openers,
# `if mobj is None:` guards, loop headers and initializations (e.g. of
# `video_ids`, `ids_in_page`, `pagenum`, `all_ids_count`) are missing from
# this view; only the visible statements are documented.
1964 class BlipTVUserIE(InfoExtractor):
1965 """Information Extractor for blip.tv users."""
1967 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
1969 IE_NAME = u'blip.tv:user'
1971 def __init__(self, downloader=None):
1972 InfoExtractor.__init__(self, downloader)
# Progress reporting helper: one screen line per AJAX page fetched.
1974 def report_download_page(self, username, pagenum):
1975 """Report attempt to download user page."""
1976 self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
1977 (self.IE_NAME, username, pagenum))
1979 def _real_extract(self, url):
1981 mobj = re.match(self._VALID_URL, url)
1983 self._downloader.report_error(u'invalid url: %s' % url)
# Group 1 of _VALID_URL is the username (works for both URL and bliptvuser: forms).
1986 username = mobj.group(1)
1988 page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
1990 request = compat_urllib_request.Request(url)
# The profile page embeds the numeric user id in a data-users-id attribute;
# it is substituted into page_base for the episode-list queries below.
1993 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1994 mobj = re.search(r'data-users-id="([^"]+)"', page)
1995 page_base = page_base % mobj.group(1)
1996 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1997 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
2001 # Download video ids using BlipTV Ajax calls. Result size per
2002 # query is limited (currently to 12 videos) so we need to query
2003 # page by page until there are no video ids - it means we got
2010 self.report_download_page(username, pagenum)
2011 url = page_base + "&page=" + str(pagenum)
2012 request = compat_urllib_request.Request( url )
2014 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
2015 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
# NOTE(review): uses str(err) here while the sibling handler above uses
# compat_str(err) — inconsistent; compat_str is what the rest of the file uses.
2016 self._downloader.report_error(u'unable to download webpage: %s' % str(err))
2019 # Extract video identifiers
# Every href="/..." on the AJAX page is treated as a video slug; duplicates
# within a page are skipped, and entities are unescaped before storing.
2022 for mobj in re.finditer(r'href="/([^"]+)"', page):
2023 if mobj.group(1) not in ids_in_page:
2024 ids_in_page.append(unescapeHTML(mobj.group(1)))
2026 video_ids.extend(ids_in_page)
2028 # A little optimization - if current page is not
2029 # "full", ie. does not contain PAGE_SIZE video ids then
2030 # we can assume that this page is the last one - there
2031 # are no more ids on further pages - no need to query
# NOTE(review): self._PAGE_SIZE is referenced but not defined in the visible
# lines — presumably a class constant elided from this listing; confirm.
2034 if len(ids_in_page) < self._PAGE_SIZE:
2039 self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
2040 (self.IE_NAME, username, all_ids_count, len(video_ids)))
# Delegate each collected slug back through the generic resolver and wrap
# everything as a single playlist named after the user.
2042 urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
2043 url_entries = [self.url_result(url) for url in urls]
2044 return [self.playlist_result(url_entries, playlist_title = username)]
# --- DepositFilesIE ---------------------------------------------------------
# Extracts direct download URLs from depositfiles.com: POSTs the
# "Free download" form, scrapes the fileshare form action for the real URL,
# and falls back to surfacing the site's restriction message on failure.
# NOTE(review): elided listing — `try:` openers, `if mobj is None:` guards and
# the final return of the info dict are missing from this view.
2047 class DepositFilesIE(InfoExtractor):
2048 """Information extractor for depositfiles.com"""
# (?#locale) is a regex comment; the `../` segment matches a 2-char locale path.
2050 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
2052 def report_download_webpage(self, file_id):
2053 """Report webpage download."""
2054 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2056 def report_extraction(self, file_id):
2057 """Report information extraction."""
2058 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2060 def _real_extract(self, url):
# file id is simply the last path component of the input URL.
2061 file_id = url.split('/')[-1]
2062 # Rebuild url in english locale
2063 url = 'http://depositfiles.com/en/files/' + file_id
2065 # Retrieve file webpage with 'Free download' button pressed
# POST body gateway_result=1 simulates pressing the free-download button.
2066 free_download_indication = { 'gateway_result' : '1' }
2067 request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
2069 self.report_download_webpage(file_id)
# NOTE(review): webpage is raw bytes (no .decode) yet is searched with str
# regex patterns below — a Python-2 remnant; would raise TypeError on py3.
2070 webpage = compat_urllib_request.urlopen(request).read()
2071 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2072 self._downloader.report_error(u'Unable to retrieve file webpage: %s' % compat_str(err))
2075 # Search for the real file URL
2076 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2077 if (mobj is None) or (mobj.group(1) is None):
2078 # Try to figure out reason of the error.
# The site reports rate limits/paywalls in a <strong>Attention...</strong>
# block; collapse its whitespace and relay it verbatim to the user.
2079 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2080 if (mobj is not None) and (mobj.group(1) is not None):
2081 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2082 self._downloader.report_error(u'%s' % restriction_message)
2084 self._downloader.report_error(u'unable to extract download URL from: %s' % url)
2087 file_url = mobj.group(1)
# Extension without the leading dot, e.g. 'zip'.
2088 file_extension = os.path.splitext(file_url)[1][1:]
2090 # Search for file title
2091 mobj = re.search(r'<b title="(.*?)">', webpage)
2093 self._downloader.report_error(u'unable to extract title')
# NOTE(review): .decode('utf-8') on regex-group/str values below is also
# Python-2-only; on py3 these are already str and decode would fail.
2095 file_title = mobj.group(1).decode('utf-8')
2098 'id': file_id.decode('utf-8'),
2099 'url': file_url.decode('utf-8'),
2101 'upload_date': None,
2102 'title': file_title,
2103 'ext': file_extension.decode('utf-8'),
# --- FacebookIE -------------------------------------------------------------
# Extracts Facebook videos. _real_initialize logs in (credentials from
# options or .netrc) so non-public videos resolve; _real_extract parses the
# SWF parameter blob embedded in the watch page JS, preferring hd_src over
# sd_src, and scrapes title/duration/thumbnail.
# NOTE(review): elided listing — `try:`/`else:` lines, the login_form
# construction, and several guards/returns are missing from this view.
2107 class FacebookIE(InfoExtractor):
2108 """Information Extractor for Facebook"""
# Named group ID captures the numeric video id from video.php/photo.php URLs.
2110 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2111 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2112 _NETRC_MACHINE = 'facebook'
2113 IE_NAME = u'facebook'
2115 def report_login(self):
2116 """Report attempt to log in."""
2117 self._downloader.to_screen(u'[%s] Logging in' % self.IE_NAME)
2119 def _real_initialize(self):
# Without a downloader there is nowhere to get credentials from.
2120 if self._downloader is None:
2125 downloader_params = self._downloader.params
2127 # Attempt to use provided username and password or .netrc data
2128 if downloader_params.get('username', None) is not None:
2129 useremail = downloader_params['username']
2130 password = downloader_params['password']
2131 elif downloader_params.get('usenetrc', False):
# .netrc lookup keyed on the 'facebook' machine entry.
2133 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2134 if info is not None:
2138 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2139 except (IOError, netrc.NetrcParseError) as err:
# .netrc problems are non-fatal: warn and continue anonymously.
2140 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
2143 if useremail is None:
2152 request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
# NOTE(review): login_results is raw bytes (no .decode) but is matched with a
# str regex below — Python-2 remnant; would raise TypeError on py3.
2155 login_results = compat_urllib_request.urlopen(request).read()
# A login form still present in the response means authentication failed.
2156 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2157 self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2159 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2160 self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
2163 def _real_extract(self, url):
2164 mobj = re.match(self._VALID_URL, url)
2166 self._downloader.report_error(u'invalid URL: %s' % url)
2168 video_id = mobj.group('ID')
# Canonicalize to the https video.php form regardless of the input URL shape.
2170 url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
2171 webpage = self._download_webpage(url, video_id)
# The player setup JS contains a JSON array of [name, value] pairs between
# these two literal anchors; dict() of those pairs yields the swf variables.
2173 BEFORE = '{swf.addParam(param[0], param[1]);});\n'
2174 AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
2175 m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
2177 raise ExtractorError(u'Cannot parse data')
2178 data = dict(json.loads(m.group(1)))
# 'params' is URL-encoded JSON holding the actual stream descriptors.
2179 params_raw = compat_urllib_parse.unquote(data['params'])
2180 params = json.loads(params_raw)
2181 video_data = params['video_data'][0]
# Prefer HD, fall back to SD; neither present is a hard error.
2182 video_url = video_data.get('hd_src')
2184 video_url = video_data['sd_src']
2186 raise ExtractorError(u'Cannot find video URL')
2187 video_duration = int(video_data['video_duration'])
2188 thumbnail = video_data['thumbnail_src']
2190 m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
2192 raise ExtractorError(u'Cannot find title in webpage')
2193 video_title = unescapeHTML(m.group(1))
2197 'title': video_title,
2200 'duration': video_duration,
2201 'thumbnail': thumbnail,
# --- BlipTVIE ---------------------------------------------------------------
# Extracts a single blip.tv video. Handles three paths: /play/ URLs are
# resolved via their redirect fragment and re-dispatched; responses with a
# video/* Content-Type are direct downloads; otherwise the JSON API
# (skin=json) is parsed for media URL and metadata.
# NOTE(review): elided listing — `try:`/`else:` lines, the cchar computation,
# the direct-download info dict, and final returns are missing from this view.
2206 class BlipTVIE(InfoExtractor):
2207 """Information extractor for blip.tv"""
2209 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
# Used to pull the filename extension off the media URL.
2210 _URL_EXT = r'^.*\.([a-z0-9]+)$'
2211 IE_NAME = u'blip.tv'
2213 def report_extraction(self, file_id):
2214 """Report information extraction."""
2215 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2217 def report_direct_download(self, title):
2218 """Report information extraction."""
2219 self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
2221 def _real_extract(self, url):
2222 mobj = re.match(self._VALID_URL, url)
2224 self._downloader.report_error(u'invalid URL: %s' % url)
# /play/ URLs redirect to a page whose fragment carries file=...; the last
# path component of that value is the episode id, re-dispatched recursively
# through the canonical /a/a-<id> form.
2227 urlp = compat_urllib_parse_urlparse(url)
2228 if urlp.path.startswith('/play/'):
2229 request = compat_urllib_request.Request(url)
2230 response = compat_urllib_request.urlopen(request)
2231 redirecturl = response.geturl()
2232 rurlp = compat_urllib_parse_urlparse(redirecturl)
2233 file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
2234 url = 'http://blip.tv/a/a-' + file_id
2235 return self._real_extract(url)
# skin=json&version=2&no_wrap=1 asks the API for bare JSON metadata; the
# iTunes User-Agent is required by the endpoint (and recorded in the result).
2242 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2243 request = compat_urllib_request.Request(json_url)
2244 request.add_header('User-Agent', 'iTunes/10.6.1')
2245 self.report_extraction(mobj.group(1))
2248 urlh = compat_urllib_request.urlopen(request)
2249 if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
# Direct file: derive title/ext from the URL's basename instead of JSON.
2250 basename = url.split('/')[-1]
2251 title,ext = os.path.splitext(basename)
# NOTE(review): .decode('UTF-8') on a str is Python-2-only; fails on py3.
2252 title = title.decode('UTF-8')
2253 ext = ext.replace('.', '')
2254 self.report_direct_download(title)
2259 'upload_date': None,
2264 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2265 raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
2266 if info is None: # Regular URL
2268 json_code_bytes = urlh.read()
2269 json_code = json_code_bytes.decode('utf-8')
2270 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2271 self._downloader.report_error(u'unable to read video info webpage: %s' % compat_str(err))
2275 json_data = json.loads(json_code)
# The API sometimes wraps the payload in a 'Post' envelope.
2276 if 'Post' in json_data:
2277 data = json_data['Post']
# API datestamp like '12-31-12 11:45PM' is normalized to YYYYMMDD.
2281 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2282 video_url = data['media']['url']
2283 umobj = re.match(self._URL_EXT, video_url)
2285 raise ValueError('Can not determine filename extension')
2286 ext = umobj.group(1)
2289 'id': data['item_id'],
2291 'uploader': data['display_name'],
2292 'upload_date': upload_date,
2293 'title': data['title'],
2295 'format': data['media']['mimeType'],
2296 'thumbnail': data['thumbnailUrl'],
2297 'description': data['description'],
2298 'player_url': data['embedUrl'],
2299 'user_agent': 'iTunes/10.6.1',
# Malformed/incomplete JSON payloads surface here as a single error message.
2301 except (ValueError,KeyError) as err:
2302 self._downloader.report_error(u'unable to parse video information: %s' % repr(err))
# --- MyVideoIE --------------------------------------------------------------
# Extracts myvideo.de videos: scrapes the thumbnail-derived media base URL
# from the watch page and appends /<id>.flv; title comes from <title>.
# NOTE(review): elided listing — `if mobj is None:` guards and the final
# return of the info dict are missing from this view.
2308 class MyVideoIE(InfoExtractor):
2309 """Information Extractor for myvideo.de."""
2311 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2312 IE_NAME = u'myvideo'
2314 def __init__(self, downloader=None):
2315 InfoExtractor.__init__(self, downloader)
2317 def report_extraction(self, video_id):
2318 """Report information extraction."""
2319 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
2321 def _real_extract(self,url):
2322 mobj = re.match(self._VALID_URL, url)
# BUG(review): `self._download` should almost certainly be `self._downloader`
# (every sibling extractor uses self._downloader) — this line would raise
# AttributeError if ever reached.
2324 self._download.report_error(u'invalid URL: %s' % url)
2327 video_id = mobj.group(1)
# Re-fetch via the canonical watch URL rather than the user-supplied one.
2330 webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
2331 webpage = self._download_webpage(webpage_url, video_id)
2333 self.report_extraction(video_id)
# The image_src <link> points at .../movie<N>/<hash>/thumbs/...; group(1)
# (everything before /thumbs/) is the media base for the flv URL below.
2334 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\'',
2337 self._downloader.report_error(u'unable to extract media URL')
2339 video_url = mobj.group(1) + ('/%s.flv' % video_id)
2341 mobj = re.search('<title>([^<]+)</title>', webpage)
2343 self._downloader.report_error(u'unable to extract title')
2346 video_title = mobj.group(1)
2352 'upload_date': None,
2353 'title': video_title,
# --- ComedyCentralIE --------------------------------------------------------
# Extracts The Daily Show / Colbert Report episodes and clips. Resolves
# :shortname aliases to full-episode pages, finds the MTVN services media
# URI in the page, downloads the MRSS index, then a per-part mediagen config,
# picks a bitrate rendition and rewrites the rtmp URL to an HTTP mirror.
# NOTE(review): elided listing — `try:`/`else:` lines, the results/turls
# initializations, the format-selection loop and final return are missing
# from this view.
2357 class ComedyCentralIE(InfoExtractor):
2358 """Information extractor for The Daily Show and Colbert Report """
2360 # urls can be abbreviations like :thedailyshow or :colbert
2361 # urls for episodes like:
2362 # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
2363 # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
2364 # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
# Verbose regex (see suitable() below, which passes re.VERBOSE) with named
# groups: shortname, showname, episode, clipID/cntitle, date/tdstitle.
2365 _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
2366 |(https?://)?(www\.)?
2367 (?P<showname>thedailyshow|colbertnation)\.com/
2368 (full-episodes/(?P<episode>.*)|
2370 (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
2371 |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
# Bitrates the site offers, lowest-quality-last ordering NOT implied: turls
# built from these are sorted by the mediagen response order below.
2374 _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
2376 _video_extensions = {
2384 _video_dimensions = {
# Overrides the base class because _VALID_URL needs the VERBOSE flag.
2394 def suitable(cls, url):
2395 """Receives a URL and returns True if suitable for this IE."""
2396 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
2398 def report_extraction(self, episode_id):
2399 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
2401 def report_config_download(self, episode_id, media_id):
2402 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration for %s' % (episode_id, media_id))
2404 def report_index_download(self, episode_id):
2405 self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
# Prints bitrate/extension/dimensions for --list-formats.
2407 def _print_formats(self, formats):
2408 print('Available formats:')
2410 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))
2413 def _real_extract(self, url):
2414 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2416 self._downloader.report_error(u'invalid URL: %s' % url)
# :tds / :colbert style shortcuts redirect to the show's full-episodes index.
2419 if mobj.group('shortname'):
2420 if mobj.group('shortname') in ('tds', 'thedailyshow'):
2421 url = u'http://www.thedailyshow.com/full-episodes/'
2423 url = u'http://www.colbertnation.com/full-episodes/'
2424 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2425 assert mobj is not None
# Clips carry their own title group depending on which show's URL matched.
2427 if mobj.group('clip'):
2428 if mobj.group('showname') == 'thedailyshow':
2429 epTitle = mobj.group('tdstitle')
2431 epTitle = mobj.group('cntitle')
# Full-episode path: an empty episode group means "newest episode" and the
# page redirect (checked below) supplies the concrete one.
2434 dlNewest = not mobj.group('episode')
2436 epTitle = mobj.group('showname')
2438 epTitle = mobj.group('episode')
2440 req = compat_urllib_request.Request(url)
2441 self.report_extraction(epTitle)
2443 htmlHandle = compat_urllib_request.urlopen(req)
2444 html = htmlHandle.read()
2445 webpage = html.decode('utf-8')
2446 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2447 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
# Follow whatever URL the server redirected to and re-parse it; a still-empty
# episode group means the redirect did not resolve to a concrete episode.
2450 url = htmlHandle.geturl()
2451 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2453 self._downloader.report_error(u'Invalid redirected URL: ' + url)
2455 if mobj.group('episode') == '':
2456 self._downloader.report_error(u'Redirected URL is still not specific: ' + url)
2458 epTitle = mobj.group('episode')
# NOTE(review): dots in media.mtvnservices.com are unescaped in this pattern
# (matches any char) — harmless in practice but technically too loose.
2460 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)
2462 if len(mMovieParams) == 0:
2463 # The Colbert Report embeds the information in a without
2464 # a URL prefix; so extract the alternate reference
2465 # and then add the URL prefix manually.
2467 altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
2468 if len(altMovieParams) == 0:
2469 self._downloader.report_error(u'unable to find Flash URL in webpage ' + url)
2472 mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
# The second tuple element is the bare mgid: URI used for feed queries.
2474 uri = mMovieParams[0][1]
2475 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
2476 self.report_index_download(epTitle)
2478 indexXml = compat_urllib_request.urlopen(indexUrl).read()
2479 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2480 self._downloader.report_error(u'unable to download episode index: ' + compat_str(err))
# One MRSS <item> per episode part (act); each part is fetched separately.
2485 idoc = xml.etree.ElementTree.fromstring(indexXml)
2486 itemEls = idoc.findall('.//item')
2487 for partNum,itemEl in enumerate(itemEls):
2488 mediaId = itemEl.findall('./guid')[0].text
2489 shortMediaId = mediaId.split(':')[-1]
2490 showId = mediaId.split(':')[-2].replace('.com', '')
2491 officialTitle = itemEl.findall('./title')[0].text
2492 officialDate = itemEl.findall('./pubDate')[0].text
2494 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2495 compat_urllib_parse.urlencode({'uri': mediaId}))
2496 configReq = compat_urllib_request.Request(configUrl)
2497 self.report_config_download(epTitle, shortMediaId)
2499 configXml = compat_urllib_request.urlopen(configReq).read()
2500 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2501 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
# mediagen XML lists <rendition bitrate=...><src>rtmp://...</src> entries.
2504 cdoc = xml.etree.ElementTree.fromstring(configXml)
2506 for rendition in cdoc.findall('.//rendition'):
2507 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2511 self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
2514 if self._downloader.params.get('listformats', None):
2515 self._print_formats([i[0] for i in turls])
2518 # For now, just pick the highest bitrate
2519 format,rtmp_video_url = turls[-1]
2521 # Get the format arg from the arg stream
2522 req_format = self._downloader.params.get('format', None)
2524 # Select format if we can find one
2527 format, rtmp_video_url = f, v
# rtmp URLs are rewritten onto the llnwd.net HTTP mirror: only the
# gsp.comedystor/... tail survives, appended to the fixed base below.
2530 m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
2532 raise ExtractorError(u'Cannot transform RTMP url')
2533 base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
2534 video_url = base + m.group('finalid')
# Per-part display title, 1-based part numbering.
2536 effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
2541 'upload_date': officialDate,
2546 'description': officialTitle,
2548 results.append(info)
# --- EscapistIE -------------------------------------------------------------
# Extracts Escapist Magazine videos: scrapes og:video for the player URL,
# pulls its config= query parameter, fetches that JS-flavored config,
# normalizes quotes to valid JSON, and reads the media URL from playlist[1].
# NOTE(review): elided listing — `try:` openers, `if mobj is None:` guard and
# the final return are missing from this view. The re.search results at
# 2583-2589 are used unguarded; a page change would raise AttributeError.
2553 class EscapistIE(InfoExtractor):
2554 """Information extractor for The Escapist """
2556 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2557 IE_NAME = u'escapist'
2559 def report_extraction(self, showName):
2560 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
2562 def report_config_download(self, showName):
2563 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
2565 def _real_extract(self, url):
2566 mobj = re.match(self._VALID_URL, url)
2568 self._downloader.report_error(u'invalid URL: %s' % url)
2570 showName = mobj.group('showname')
2571 videoId = mobj.group('episode')
2573 self.report_extraction(showName)
# Decode using the charset advertised in Content-Type, defaulting to utf-8.
2575 webPage = compat_urllib_request.urlopen(url)
2576 webPageBytes = webPage.read()
2577 m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
2578 webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
2579 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2580 self._downloader.report_error(u'unable to download webpage: ' + compat_str(err))
# Metadata comes from standard meta tags; the player URL's config= query
# parameter points at the JSON-ish player configuration.
2583 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2584 description = unescapeHTML(descMatch.group(1))
2585 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2586 imgUrl = unescapeHTML(imgMatch.group(1))
2587 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2588 playerUrl = unescapeHTML(playerUrlMatch.group(1))
2589 configUrlMatch = re.search('config=(.*)$', playerUrl)
2590 configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))
2592 self.report_config_download(showName)
# Same charset-sniffing dance for the configuration document.
2594 configJSON = compat_urllib_request.urlopen(configUrl)
2595 m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
2596 configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
2597 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2598 self._downloader.report_error(u'unable to download configuration: ' + compat_str(err))
2601 # Technically, it's JavaScript, not JSON
# Crude single→double quote swap to make the JS object literal json-parseable;
# would corrupt any value containing an apostrophe.
2602 configJSON = configJSON.replace("'", '"')
2605 config = json.loads(configJSON)
2606 except (ValueError,) as err:
2607 self._downloader.report_error(u'Invalid JSON in configuration file: ' + compat_str(err))
# playlist[1] holds the actual media entry (index 0 presumably an ad/intro —
# TODO confirm against a live config).
2610 playlist = config['playlist']
2611 videoUrl = playlist[1]['url']
2616 'uploader': showName,
2617 'upload_date': None,
2620 'thumbnail': imgUrl,
2621 'description': description,
2622 'player_url': playerUrl,
# --- CollegeHumorIE ---------------------------------------------------------
# Extracts collegehumor.com videos: fetches the moogaloop metadata XML for
# title/description/thumbnail and an f4m manifest URL, then reads the
# manifest to assemble the final segment URL (Seg1-Frag1 of the media node).
# NOTE(review): elided listing — `try:` openers, the initial `info = {...}`
# head, and the final return are missing from this view.
2627 class CollegeHumorIE(InfoExtractor):
2628 """Information extractor for collegehumor.com"""
2631 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2632 IE_NAME = u'collegehumor'
2634 def report_manifest(self, video_id):
2635 """Report information extraction."""
2636 self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))
2638 def report_extraction(self, video_id):
2639 """Report information extraction."""
2640 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2642 def _real_extract(self, url):
2643 mobj = re.match(self._VALID_URL, url)
2645 self._downloader.report_error(u'invalid URL: %s' % url)
2647 video_id = mobj.group('videoid')
2652 'upload_date': None,
2655 self.report_extraction(video_id)
# Metadata endpoint keyed by numeric video id.
2656 xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
2658 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2659 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2660 self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
# Any missing element raises IndexError via the [0] lookups and is reported
# as an invalid metadata file (handler elided from this view).
2663 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2665 videoNode = mdoc.findall('./video')[0]
2666 info['description'] = videoNode.findall('./description')[0].text
2667 info['title'] = videoNode.findall('./caption')[0].text
2668 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
2669 manifest_url = videoNode.findall('./file')[0].text
2671 self._downloader.report_error(u'Invalid metadata XML file')
# hdcore parameter is required for the Adobe HDS manifest to be served.
2674 manifest_url += '?hdcore=2.10.3'
2675 self.report_manifest(video_id)
2677 manifestXml = compat_urllib_request.urlopen(manifest_url).read()
2678 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2679 self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
# f4m namespace-qualified lookups: media/@url is the node id, id/text the
# canonical video id (its last two chars are dropped when building the URL).
2682 adoc = xml.etree.ElementTree.fromstring(manifestXml)
2684 media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
2685 node_id = media_node.attrib['url']
2686 video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
2687 except IndexError as err:
2688 self._downloader.report_error(u'Invalid manifest file')
2691 url_pr = compat_urllib_parse_urlparse(manifest_url)
2692 url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
# --- XVideosIE --------------------------------------------------------------
# Extracts xvideos.com videos by scraping three things from the watch page:
# the URL-encoded flv_url flash variable, the <title> tag, and the thumbnail
# URL pattern.
# NOTE(review): elided listing — `if mobj is None:` guards and the final
# return of the info dict are missing from this view.
2699 class XVideosIE(InfoExtractor):
2700 """Information extractor for xvideos.com"""
2702 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2703 IE_NAME = u'xvideos'
2705 def report_extraction(self, video_id):
2706 """Report information extraction."""
2707 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2709 def _real_extract(self, url):
2710 mobj = re.match(self._VALID_URL, url)
2712 self._downloader.report_error(u'invalid URL: %s' % url)
2714 video_id = mobj.group(1)
2716 webpage = self._download_webpage(url, video_id)
2718 self.report_extraction(video_id)
# flv_url flash variable is percent-encoded; unquote to the real media URL.
2722 mobj = re.search(r'flv_url=(.+?)&', webpage)
2724 self._downloader.report_error(u'unable to extract video url')
2726 video_url = compat_urllib_parse.unquote(mobj.group(1))
# Title is the <title> text with the trailing " - XVID..." suffix stripped
# by the regex itself.
2730 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2732 self._downloader.report_error(u'unable to extract video title')
2734 video_title = mobj.group(1)
2737 # Extract video thumbnail
# group(0): the whole matched URL is the thumbnail address.
2738 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
2740 self._downloader.report_error(u'unable to extract video thumbnail')
2742 video_thumbnail = mobj.group(0)
2748 'upload_date': None,
2749 'title': video_title,
2751 'thumbnail': video_thumbnail,
2752 'description': None,
# --- SoundcloudIE -----------------------------------------------------------
# Extracts a single SoundCloud track: resolves the /<uploader>/<slug> URL to
# track JSON via the resolve API, then fetches the streams endpoint and
# returns the http_mp3_128_url stream. A hard-coded public client_id is used
# for both API calls.
# NOTE(review): elided listing — `try:` openers, `if mobj is None:` guard and
# the final return are missing from this view.
2758 class SoundcloudIE(InfoExtractor):
2759 """Information extractor for soundcloud.com
2760 To access the media, the uid of the song and a stream token
2761 must be extracted from the page source and the script must make
2762 a request to media.soundcloud.com/crossdomain.xml. Then
2763 the media can be grabbed by requesting from an url composed
2764 of the stream token and uid
2767 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2768 IE_NAME = u'soundcloud'
2770 def __init__(self, downloader=None):
2771 InfoExtractor.__init__(self, downloader)
2773 def report_resolve(self, video_id):
2774 """Report information extraction."""
2775 self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))
2777 def report_extraction(self, video_id):
2778 """Report information extraction."""
2779 self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))
2781 def _real_extract(self, url):
2782 mobj = re.match(self._VALID_URL, url)
2784 self._downloader.report_error(u'invalid URL: %s' % url)
2787 # extract uploader (which is in the url)
2788 uploader = mobj.group(1)
2789 # extract simple title (uploader + slug of song title)
2790 slug_title = mobj.group(2)
2791 simple_title = uploader + u'-' + slug_title
2793 self.report_resolve('%s/%s' % (uploader, slug_title))
# resolve.json maps the canonical page URL to the track's JSON metadata.
2795 url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
2796 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2797 request = compat_urllib_request.Request(resolv_url)
2799 info_json_bytes = compat_urllib_request.urlopen(request).read()
2800 info_json = info_json_bytes.decode('utf-8')
2801 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2802 self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
2805 info = json.loads(info_json)
2806 video_id = info['id']
2807 self.report_extraction('%s/%s' % (uploader, slug_title))
# Streams endpoint keyed by numeric track id; same client_id as above.
2809 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2810 request = compat_urllib_request.Request(streams_url)
2812 stream_json_bytes = compat_urllib_request.urlopen(request).read()
2813 stream_json = stream_json_bytes.decode('utf-8')
2814 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2815 self._downloader.report_error(u'unable to download stream definitions: %s' % compat_str(err))
# 128kbps MP3 stream is the only format surfaced.
2818 streams = json.loads(stream_json)
2819 mediaURL = streams['http_mp3_128_url']
2824 'uploader': info['user']['username'],
# NOTE(review): created_at is passed through raw, not normalized to the
# YYYYMMDD upload_date convention documented on InfoExtractor.
2825 'upload_date': info['created_at'],
2826 'title': info['title'],
2828 'description': info['description'],
# --- SoundcloudSetIE --------------------------------------------------------
# Extracts a SoundCloud set (playlist): resolves /<uploader>/sets/<slug> to
# set JSON, then iterates info['tracks'], fetching each track's streams and
# collecting the http_mp3_128_url entries.
# NOTE(review): elided listing — `try:` openers, guards, the results
# accumulator and final return are missing from this view.
# NOTE(review): IE_NAME duplicates SoundcloudIE's u'soundcloud' (siblings use
# distinct names like u'blip.tv:user'); and this class still calls the
# legacy self._downloader.trouble(u'ERROR: ...') API where every other
# extractor here uses report_error — both look like candidates for cleanup.
2831 class SoundcloudSetIE(InfoExtractor):
2832 """Information extractor for soundcloud.com sets
2833 To access the media, the uid of the song and a stream token
2834 must be extracted from the page source and the script must make
2835 a request to media.soundcloud.com/crossdomain.xml. Then
2836 the media can be grabbed by requesting from an url composed
2837 of the stream token and uid
2840 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
2841 IE_NAME = u'soundcloud'
2843 def __init__(self, downloader=None):
2844 InfoExtractor.__init__(self, downloader)
2846 def report_resolve(self, video_id):
2847 """Report information extraction."""
2848 self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))
2850 def report_extraction(self, video_id):
2851 """Report information extraction."""
2852 self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))
2854 def _real_extract(self, url):
2855 mobj = re.match(self._VALID_URL, url)
2857 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2860 # extract uploader (which is in the url)
2861 uploader = mobj.group(1)
2862 # extract simple title (uploader + slug of song title)
2863 slug_title = mobj.group(2)
2864 simple_title = uploader + u'-' + slug_title
2866 self.report_resolve('%s/sets/%s' % (uploader, slug_title))
# Same resolve-API flow as SoundcloudIE, but against the /sets/ URL.
2868 url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
2869 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2870 request = compat_urllib_request.Request(resolv_url)
2872 info_json_bytes = compat_urllib_request.urlopen(request).read()
2873 info_json = info_json_bytes.decode('utf-8')
2874 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2875 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
# The resolve API returns an 'errors' list instead of raising HTTP errors;
# surface each error_message individually.
2879 info = json.loads(info_json)
2880 if 'errors' in info:
2881 for err in info['errors']:
2882 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err['error_message']))
# One streams request per track in the set.
2885 for track in info['tracks']:
2886 video_id = track['id']
2887 self.report_extraction('%s/sets/%s' % (uploader, slug_title))
2889 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2890 request = compat_urllib_request.Request(streams_url)
2892 stream_json_bytes = compat_urllib_request.urlopen(request).read()
2893 stream_json = stream_json_bytes.decode('utf-8')
2894 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2895 self._downloader.trouble(u'ERROR: unable to download stream definitions: %s' % compat_str(err))
2898 streams = json.loads(stream_json)
2899 mediaURL = streams['http_mp3_128_url']
2904 'uploader': track['user']['username'],
# NOTE(review): raw created_at again, not the YYYYMMDD upload_date format.
2905 'upload_date': track['created_at'],
2906 'title': track['title'],
2908 'description': track['description'],
# --- InfoQIE ----------------------------------------------------------------
# Extracts InfoQ presentation videos: the page embeds a base64-encoded,
# URL-quoted media path in jsclassref='...', which is decoded and appended
# to the rtmpe streaming base; title and description come from page markup.
# NOTE(review): elided listing — `if mobj is None:` guards and the final
# return of the info dict are missing from this view.
2913 class InfoQIE(InfoExtractor):
2914 """Information extractor for infoq.com"""
2915 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2917 def report_extraction(self, video_id):
2918 """Report information extraction."""
2919 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2921 def _real_extract(self, url):
2922 mobj = re.match(self._VALID_URL, url)
2924 self._downloader.report_error(u'invalid URL: %s' % url)
# No numeric id in the URL scheme, so the full URL stands in as video_id
# for progress reporting.
2927 webpage = self._download_webpage(url, video_id=url)
2928 self.report_extraction(url)
# jsclassref holds base64(percent-encoded media path).
2931 mobj = re.search(r"jsclassref='([^']*)'", webpage)
2933 self._downloader.report_error(u'unable to extract video url')
2935 real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
2936 video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id
2939 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2941 self._downloader.report_error(u'unable to extract video title')
2943 video_title = mobj.group(1)
2945 # Extract description
# Description is optional: default text is kept when the meta tag is absent.
2946 video_description = u'No description available.'
2947 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2948 if mobj is not None:
2949 video_description = mobj.group(1)
# Derive the final id and extension from the media path's basename.
2951 video_filename = video_url.split('/')[-1]
2952 video_id, extension = video_filename.split('.')
2958 'upload_date': None,
2959 'title': video_title,
2960 'ext': extension, # Extension is always(?) mp4, but seems to be flv
2962 'description': video_description,
2967 class MixcloudIE(InfoExtractor):
2968 """Information extractor for www.mixcloud.com"""
2970 _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
2971 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2972 IE_NAME = u'mixcloud'
2974 def __init__(self, downloader=None):
2975 InfoExtractor.__init__(self, downloader)
2977 def report_download_json(self, file_id):
2978 """Report JSON download."""
2979 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
2981 def report_extraction(self, file_id):
2982 """Report information extraction."""
2983 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2985 def get_urls(self, jsonData, fmt, bitrate='best'):
2986 """Get urls from 'audio_formats' section in json"""
2989 bitrate_list = jsonData[fmt]
2990 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2991 bitrate = max(bitrate_list) # select highest
2993 url_list = jsonData[fmt][bitrate]
2994 except TypeError: # we have no bitrate info.
2995 url_list = jsonData[fmt]
2998 def check_urls(self, url_list):
2999 """Returns 1st active url from list"""
3000 for url in url_list:
3002 compat_urllib_request.urlopen(url)
3004 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3009 def _print_formats(self, formats):
3010 print('Available formats:')
3011 for fmt in formats.keys():
3012 for b in formats[fmt]:
3014 ext = formats[fmt][b][0]
3015 print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
3016 except TypeError: # we have no bitrate info
3017 ext = formats[fmt][0]
3018 print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
3021 def _real_extract(self, url):
3022 mobj = re.match(self._VALID_URL, url)
3024 self._downloader.report_error(u'invalid URL: %s' % url)
3026 # extract uploader & filename from url
3027 uploader = mobj.group(1).decode('utf-8')
3028 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
3030 # construct API request
3031 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
3032 # retrieve .json file with links to files
3033 request = compat_urllib_request.Request(file_url)
3035 self.report_download_json(file_url)
3036 jsonData = compat_urllib_request.urlopen(request).read()
3037 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3038 self._downloader.report_error(u'Unable to retrieve file: %s' % compat_str(err))
3042 json_data = json.loads(jsonData)
3043 player_url = json_data['player_swf_url']
3044 formats = dict(json_data['audio_formats'])
3046 req_format = self._downloader.params.get('format', None)
3049 if self._downloader.params.get('listformats', None):
3050 self._print_formats(formats)
3053 if req_format is None or req_format == 'best':
3054 for format_param in formats.keys():
3055 url_list = self.get_urls(formats, format_param)
3057 file_url = self.check_urls(url_list)
3058 if file_url is not None:
3061 if req_format not in formats:
3062 self._downloader.report_error(u'format is not available')
3065 url_list = self.get_urls(formats, req_format)
3066 file_url = self.check_urls(url_list)
3067 format_param = req_format
3070 'id': file_id.decode('utf-8'),
3071 'url': file_url.decode('utf-8'),
3072 'uploader': uploader.decode('utf-8'),
3073 'upload_date': None,
3074 'title': json_data['name'],
3075 'ext': file_url.split('.')[-1].decode('utf-8'),
3076 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
3077 'thumbnail': json_data['thumbnail_url'],
3078 'description': json_data['description'],
3079 'player_url': player_url.decode('utf-8'),
3082 class StanfordOpenClassroomIE(InfoExtractor):
3083 """Information extractor for Stanford's Open ClassRoom"""
3085 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
3086 IE_NAME = u'stanfordoc'
3088 def report_download_webpage(self, objid):
3089 """Report information extraction."""
3090 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
3092 def report_extraction(self, video_id):
3093 """Report information extraction."""
3094 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3096 def _real_extract(self, url):
3097 mobj = re.match(self._VALID_URL, url)
3099 raise ExtractorError(u'Invalid URL: %s' % url)
3101 if mobj.group('course') and mobj.group('video'): # A specific video
3102 course = mobj.group('course')
3103 video = mobj.group('video')
3105 'id': course + '_' + video,
3107 'upload_date': None,
3110 self.report_extraction(info['id'])
3111 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
3112 xmlUrl = baseUrl + video + '.xml'
3114 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
3115 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3116 self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
3118 mdoc = xml.etree.ElementTree.fromstring(metaXml)
3120 info['title'] = mdoc.findall('./title')[0].text
3121 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
3123 self._downloader.report_error(u'Invalid metadata XML file')
3125 info['ext'] = info['url'].rpartition('.')[2]
3127 elif mobj.group('course'): # A course page
3128 course = mobj.group('course')
3133 'upload_date': None,
3136 coursepage = self._download_webpage(url, info['id'],
3137 note='Downloading course info page',
3138 errnote='Unable to download course info page')
3140 m = re.search('<h1>([^<]+)</h1>', coursepage)
3142 info['title'] = unescapeHTML(m.group(1))
3144 info['title'] = info['id']
3146 m = re.search('<description>([^<]+)</description>', coursepage)
3148 info['description'] = unescapeHTML(m.group(1))
3150 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
3153 'type': 'reference',
3154 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
3158 for entry in info['list']:
3159 assert entry['type'] == 'reference'
3160 results += self.extract(entry['url'])
3164 'id': 'Stanford OpenClassroom',
3167 'upload_date': None,
3170 self.report_download_webpage(info['id'])
3171 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
3173 rootpage = compat_urllib_request.urlopen(rootURL).read()
3174 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3175 self._downloader.report_error(u'unable to download course info page: ' + compat_str(err))
3178 info['title'] = info['id']
3180 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
3183 'type': 'reference',
3184 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
3189 for entry in info['list']:
3190 assert entry['type'] == 'reference'
3191 results += self.extract(entry['url'])
3194 class MTVIE(InfoExtractor):
3195 """Information extractor for MTV.com"""
3197 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
3200 def report_extraction(self, video_id):
3201 """Report information extraction."""
3202 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3204 def _real_extract(self, url):
3205 mobj = re.match(self._VALID_URL, url)
3207 self._downloader.report_error(u'invalid URL: %s' % url)
3209 if not mobj.group('proto'):
3210 url = 'http://' + url
3211 video_id = mobj.group('videoid')
3213 webpage = self._download_webpage(url, video_id)
3215 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
3217 self._downloader.report_error(u'unable to extract song name')
3219 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3220 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
3222 self._downloader.report_error(u'unable to extract performer')
3224 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3225 video_title = performer + ' - ' + song_name
3227 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
3229 self._downloader.report_error(u'unable to mtvn_uri')
3231 mtvn_uri = mobj.group(1)
3233 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
3235 self._downloader.report_error(u'unable to extract content id')
3237 content_id = mobj.group(1)
3239 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
3240 self.report_extraction(video_id)
3241 request = compat_urllib_request.Request(videogen_url)
3243 metadataXml = compat_urllib_request.urlopen(request).read()
3244 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3245 self._downloader.report_error(u'unable to download video metadata: %s' % compat_str(err))
3248 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
3249 renditions = mdoc.findall('.//rendition')
3251 # For now, always pick the highest quality.
3252 rendition = renditions[-1]
3255 _,_,ext = rendition.attrib['type'].partition('/')
3256 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
3257 video_url = rendition.find('./src').text
3259 self._downloader.trouble('Invalid rendition field.')
3265 'uploader': performer,
3266 'upload_date': None,
3267 'title': video_title,
3275 class YoukuIE(InfoExtractor):
3276 _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
3278 def report_download_webpage(self, file_id):
3279 """Report webpage download."""
3280 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, file_id))
3282 def report_extraction(self, file_id):
3283 """Report information extraction."""
3284 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3287 nowTime = int(time.time() * 1000)
3288 random1 = random.randint(1000,1998)
3289 random2 = random.randint(1000,9999)
3291 return "%d%d%d" %(nowTime,random1,random2)
3293 def _get_file_ID_mix_string(self, seed):
3295 source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
3297 for i in range(len(source)):
3298 seed = (seed * 211 + 30031 ) % 65536
3299 index = math.floor(seed / 65536 * len(source) )
3300 mixed.append(source[int(index)])
3301 source.remove(source[int(index)])
3302 #return ''.join(mixed)
3305 def _get_file_id(self, fileId, seed):
3306 mixed = self._get_file_ID_mix_string(seed)
3307 ids = fileId.split('*')
3311 realId.append(mixed[int(ch)])
3312 return ''.join(realId)
3314 def _real_extract(self, url):
3315 mobj = re.match(self._VALID_URL, url)
3317 self._downloader.report_error(u'invalid URL: %s' % url)
3319 video_id = mobj.group('ID')
3321 info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
3323 request = compat_urllib_request.Request(info_url, None, std_headers)
3325 self.report_download_webpage(video_id)
3326 jsondata = compat_urllib_request.urlopen(request).read()
3327 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3328 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
3331 self.report_extraction(video_id)
3333 jsonstr = jsondata.decode('utf-8')
3334 config = json.loads(jsonstr)
3336 video_title = config['data'][0]['title']
3337 seed = config['data'][0]['seed']
3339 format = self._downloader.params.get('format', None)
3340 supported_format = list(config['data'][0]['streamfileids'].keys())
3342 if format is None or format == 'best':
3343 if 'hd2' in supported_format:
3348 elif format == 'worst':
3356 fileid = config['data'][0]['streamfileids'][format]
3357 keys = [s['k'] for s in config['data'][0]['segs'][format]]
3358 except (UnicodeDecodeError, ValueError, KeyError):
3359 self._downloader.report_error(u'unable to extract info section')
3363 sid = self._gen_sid()
3364 fileid = self._get_file_id(fileid, seed)
3366 #column 8,9 of fileid represent the segment number
3367 #fileid[7:9] should be changed
3368 for index, key in enumerate(keys):
3370 temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
3371 download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
3374 'id': '%s_part%02d' % (video_id, index),
3375 'url': download_url,
3377 'upload_date': None,
3378 'title': video_title,
3381 files_info.append(info)
3386 class XNXXIE(InfoExtractor):
3387 """Information extractor for xnxx.com"""
3389 _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
3391 VIDEO_URL_RE = r'flv_url=(.*?)&'
3392 VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
3393 VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'
3395 def report_webpage(self, video_id):
3396 """Report information extraction"""
3397 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3399 def report_extraction(self, video_id):
3400 """Report information extraction"""
3401 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3403 def _real_extract(self, url):
3404 mobj = re.match(self._VALID_URL, url)
3406 self._downloader.report_error(u'invalid URL: %s' % url)
3408 video_id = mobj.group(1)
3410 self.report_webpage(video_id)
3412 # Get webpage content
3414 webpage_bytes = compat_urllib_request.urlopen(url).read()
3415 webpage = webpage_bytes.decode('utf-8')
3416 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3417 self._downloader.report_error(u'unable to download video webpage: %s' % err)
3420 result = re.search(self.VIDEO_URL_RE, webpage)
3422 self._downloader.report_error(u'unable to extract video url')
3424 video_url = compat_urllib_parse.unquote(result.group(1))
3426 result = re.search(self.VIDEO_TITLE_RE, webpage)
3428 self._downloader.report_error(u'unable to extract video title')
3430 video_title = result.group(1)
3432 result = re.search(self.VIDEO_THUMB_RE, webpage)
3434 self._downloader.report_error(u'unable to extract video thumbnail')
3436 video_thumbnail = result.group(1)
3442 'upload_date': None,
3443 'title': video_title,
3445 'thumbnail': video_thumbnail,
3446 'description': None,
3450 class GooglePlusIE(InfoExtractor):
3451 """Information extractor for plus.google.com."""
3453 _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
3454 IE_NAME = u'plus.google'
3456 def __init__(self, downloader=None):
3457 InfoExtractor.__init__(self, downloader)
3459 def report_extract_entry(self, url):
3460 """Report downloading extry"""
3461 self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)
3463 def report_date(self, upload_date):
3464 """Report downloading extry"""
3465 self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)
3467 def report_uploader(self, uploader):
3468 """Report downloading extry"""
3469 self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)
3471 def report_title(self, video_title):
3472 """Report downloading extry"""
3473 self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)
3475 def report_extract_vid_page(self, video_page):
3476 """Report information extraction."""
3477 self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)
3479 def _real_extract(self, url):
3480 # Extract id from URL
3481 mobj = re.match(self._VALID_URL, url)
3483 self._downloader.report_error(u'Invalid URL: %s' % url)
3486 post_url = mobj.group(0)
3487 video_id = mobj.group(1)
3489 video_extension = 'flv'
3491 # Step 1, Retrieve post webpage to extract further information
3492 self.report_extract_entry(post_url)
3493 request = compat_urllib_request.Request(post_url)
3495 webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3496 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3497 self._downloader.report_error(u'Unable to retrieve entry webpage: %s' % compat_str(err))
3500 # Extract update date
3502 pattern = 'title="Timestamp">(.*?)</a>'
3503 mobj = re.search(pattern, webpage)
3505 upload_date = mobj.group(1)
3506 # Convert timestring to a format suitable for filename
3507 upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3508 upload_date = upload_date.strftime('%Y%m%d')
3509 self.report_date(upload_date)
3513 pattern = r'rel\="author".*?>(.*?)</a>'
3514 mobj = re.search(pattern, webpage)
3516 uploader = mobj.group(1)
3517 self.report_uploader(uploader)
3520 # Get the first line for title
3522 pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3523 mobj = re.search(pattern, webpage)
3525 video_title = mobj.group(1)
3526 self.report_title(video_title)
3528 # Step 2, Stimulate clicking the image box to launch video
3529 pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3530 mobj = re.search(pattern, webpage)
3532 self._downloader.report_error(u'unable to extract video page URL')
3534 video_page = mobj.group(1)
3535 request = compat_urllib_request.Request(video_page)
3537 webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3538 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3539 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
3541 self.report_extract_vid_page(video_page)
3544 # Extract video links on video page
3545 """Extract video links of all sizes"""
3546 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3547 mobj = re.findall(pattern, webpage)
3549 self._downloader.report_error(u'unable to extract video links')
3551 # Sort in resolution
3552 links = sorted(mobj)
3554 # Choose the lowest of the sort, i.e. highest resolution
3555 video_url = links[-1]
3556 # Only get the url. The resolution part in the tuple has no use anymore
3557 video_url = video_url[-1]
3558 # Treat escaped \u0026 style hex
3560 video_url = video_url.decode("unicode_escape")
3561 except AttributeError: # Python 3
3562 video_url = bytes(video_url, 'ascii').decode('unicode-escape')
3568 'uploader': uploader,
3569 'upload_date': upload_date,
3570 'title': video_title,
3571 'ext': video_extension,
3574 class NBAIE(InfoExtractor):
3575 _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
3578 def _real_extract(self, url):
3579 mobj = re.match(self._VALID_URL, url)
3581 self._downloader.report_error(u'invalid URL: %s' % url)
3584 video_id = mobj.group(1)
3585 if video_id.endswith('/index.html'):
3586 video_id = video_id[:-len('/index.html')]
3588 webpage = self._download_webpage(url, video_id)
3590 video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
3591 def _findProp(rexp, default=None):
3592 m = re.search(rexp, webpage)
3594 return unescapeHTML(m.group(1))
3598 shortened_video_id = video_id.rpartition('/')[2]
3599 title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
3601 'id': shortened_video_id,
3605 'uploader_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
3606 'description': _findProp(r'<div class="description">(.*?)</h1>'),
3610 class JustinTVIE(InfoExtractor):
3611 """Information extractor for justin.tv and twitch.tv"""
3612 # TODO: One broadcast may be split into multiple videos. The key
3613 # 'broadcast_id' is the same for all parts, and 'broadcast_part'
3614 # starts at 1 and increases. Can we treat all parts as one video?
3616 _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
3617 ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
3618 _JUSTIN_PAGE_LIMIT = 100
3619 IE_NAME = u'justin.tv'
3621 def report_extraction(self, file_id):
3622 """Report information extraction."""
3623 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3625 def report_download_page(self, channel, offset):
3626 """Report attempt to download a single page of videos."""
3627 self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
3628 (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
3630 # Return count of items, list of *valid* items
3631 def _parse_page(self, url):
3633 urlh = compat_urllib_request.urlopen(url)
3634 webpage_bytes = urlh.read()
3635 webpage = webpage_bytes.decode('utf-8', 'ignore')
3636 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3637 self._downloader.report_error(u'unable to download video info JSON: %s' % compat_str(err))
3640 response = json.loads(webpage)
3641 if type(response) != list:
3642 error_text = response.get('error', 'unknown error')
3643 self._downloader.report_error(u'Justin.tv API: %s' % error_text)
3646 for clip in response:
3647 video_url = clip['video_file_url']
3649 video_extension = os.path.splitext(video_url)[1][1:]
3650 video_date = re.sub('-', '', clip['start_time'][:10])
3651 video_uploader_id = clip.get('user_id', clip.get('channel_id'))
3652 video_id = clip['id']
3653 video_title = clip.get('title', video_id)
3657 'title': video_title,
3658 'uploader': clip.get('channel_name', video_uploader_id),
3659 'uploader_id': video_uploader_id,
3660 'upload_date': video_date,
3661 'ext': video_extension,
3663 return (len(response), info)
3665 def _real_extract(self, url):
3666 mobj = re.match(self._VALID_URL, url)
3668 self._downloader.report_error(u'invalid URL: %s' % url)
3671 api = 'http://api.justin.tv'
3672 video_id = mobj.group(mobj.lastindex)
3674 if mobj.lastindex == 1:
3676 api += '/channel/archives/%s.json'
3678 api += '/broadcast/by_archive/%s.json'
3679 api = api % (video_id,)
3681 self.report_extraction(video_id)
3685 limit = self._JUSTIN_PAGE_LIMIT
3688 self.report_download_page(video_id, offset)
3689 page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
3690 page_count, page_info = self._parse_page(page_url)
3691 info.extend(page_info)
3692 if not paged or page_count != limit:
3697 class FunnyOrDieIE(InfoExtractor):
3698 _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
3700 def _real_extract(self, url):
3701 mobj = re.match(self._VALID_URL, url)
3703 self._downloader.report_error(u'invalid URL: %s' % url)
3706 video_id = mobj.group('id')
3707 webpage = self._download_webpage(url, video_id)
3709 m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
3711 self._downloader.report_error(u'unable to find video information')
3712 video_url = unescapeHTML(m.group('url'))
3714 m = re.search(r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", webpage, flags=re.DOTALL)
3716 m = re.search(r'<title>(?P<title>[^<]+?)</title>', webpage)
3718 self._downloader.trouble(u'Cannot find video title')
3719 title = clean_html(m.group('title'))
3721 m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
3723 desc = unescapeHTML(m.group('desc'))
3732 'description': desc,
3736 class SteamIE(InfoExtractor):
3737 _VALID_URL = r"""http://store.steampowered.com/
3738 (?P<urltype>video|app)/ #If the page is only for videos or for a game
3740 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
3744 def suitable(cls, url):
3745 """Receives a URL and returns True if suitable for this IE."""
3746 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
3748 def _real_extract(self, url):
3749 m = re.match(self._VALID_URL, url, re.VERBOSE)
3750 urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
3751 gameID = m.group('gameID')
3752 videourl = 'http://store.steampowered.com/video/%s/' % gameID
3753 webpage = self._download_webpage(videourl, gameID)
3754 mweb = re.finditer(urlRE, webpage)
3755 namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
3756 titles = re.finditer(namesRE, webpage)
3757 thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
3758 thumbs = re.finditer(thumbsRE, webpage)
3760 for vid,vtitle,thumb in zip(mweb,titles,thumbs):
3761 video_id = vid.group('videoID')
3762 title = vtitle.group('videoName')
3763 video_url = vid.group('videoURL')
3764 video_thumb = thumb.group('thumbnail')
3766 self._downloader.report_error(u'Cannot find video url for %s' % video_id)
3771 'title': unescapeHTML(title),
3772 'thumbnail': video_thumb
3777 class UstreamIE(InfoExtractor):
3778 _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
3779 IE_NAME = u'ustream'
3781 def _real_extract(self, url):
3782 m = re.match(self._VALID_URL, url)
3783 video_id = m.group('videoID')
3784 video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
3785 webpage = self._download_webpage(url, video_id)
3786 m = re.search(r'data-title="(?P<title>.+)"',webpage)
3787 title = m.group('title')
3788 m = re.search(r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',webpage)
3789 uploader = m.group('uploader')
3795 'uploader': uploader
3799 class WorldStarHipHopIE(InfoExtractor):
3800 _VALID_URL = r'http://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
3801 IE_NAME = u'WorldStarHipHop'
3803 def _real_extract(self, url):
3804 _src_url = r"""(http://hw-videos.*(?:mp4|flv))"""
3806 webpage_src = compat_urllib_request.urlopen(url).read()
3807 webpage_src = webpage_src.decode('utf-8')
3809 mobj = re.search(_src_url, webpage_src)
3811 m = re.match(self._VALID_URL, url)
3812 video_id = m.group('id')
3814 if mobj is not None:
3815 video_url = mobj.group()
3816 if 'mp4' in video_url:
3821 self._downloader.trouble(u'ERROR: Cannot find video url for %s' % video_id)
3824 _title = r"""<title>(.*)</title>"""
3826 mobj = re.search(_title, webpage_src)
3828 if mobj is not None:
3829 title = mobj.group(1)
3831 title = 'World Start Hip Hop - %s' % time.ctime()
3833 _thumbnail = r"""rel="image_src" href="(.*)" />"""
3834 mobj = re.search(_thumbnail, webpage_src)
3836 # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
3837 if mobj is not None:
3838 thumbnail = mobj.group(1)
3840 _title = r"""candytitles.*>(.*)</span>"""
3841 mobj = re.search(_title, webpage_src)
3842 if mobj is not None:
3843 title = mobj.group(1)
3850 'thumbnail' : thumbnail,
3855 class RBMARadioIE(InfoExtractor):
3856 _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'
3858 def _real_extract(self, url):
3859 m = re.match(self._VALID_URL, url)
3860 video_id = m.group('videoID')
3862 webpage = self._download_webpage(url, video_id)
3863 m = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
3865 raise ExtractorError(u'Cannot find metadata')
3866 json_data = m.group(1)
3869 data = json.loads(json_data)
3870 except ValueError as e:
3871 raise ExtractorError(u'Invalid JSON: ' + str(e))
3873 video_url = data['akamai_url'] + '&cbr=256'
3874 url_parts = compat_urllib_parse_urlparse(video_url)
3875 video_ext = url_parts.path.rpartition('.')[2]
3880 'title': data['title'],
3881 'description': data.get('teaser_text'),
3882 'location': data.get('country_of_origin'),
3883 'uploader': data.get('host', {}).get('name'),
3884 'uploader_id': data.get('host', {}).get('slug'),
3885 'thumbnail': data.get('image', {}).get('large_url_2x'),
3886 'duration': data.get('duration'),
3891 class YouPornIE(InfoExtractor):
3892 """Information extractor for youporn.com."""
3893 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
3895 def _print_formats(self, formats):
3896 """Print all available formats"""
3897 print(u'Available formats:')
3898 print(u'ext\t\tformat')
3899 print(u'---------------------------------')
3900 for format in formats:
3901 print(u'%s\t\t%s' % (format['ext'], format['format']))
3903 def _specific(self, req_format, formats):
3905 if(x["format"]==req_format):
3909 def _real_extract(self, url):
3910 mobj = re.match(self._VALID_URL, url)
3912 self._downloader.report_error(u'invalid URL: %s' % url)
3915 video_id = mobj.group('videoid')
3917 req = compat_urllib_request.Request(url)
3918 req.add_header('Cookie', 'age_verified=1')
3919 webpage = self._download_webpage(req, video_id)
3921 # Get the video title
3922 result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
3924 raise ExtractorError(u'Unable to extract video title')
3925 video_title = result.group('title').strip()
3927 # Get the video date
3928 result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
3930 self._downloader.report_warning(u'unable to extract video date')
3933 upload_date = result.group('date').strip()
3935 # Get the video uploader
3936 result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
3938 self._downloader.report_warning(u'unable to extract uploader')
3939 video_uploader = None
3941 video_uploader = result.group('uploader').strip()
3942 video_uploader = clean_html( video_uploader )
3944 # Get all of the formats available
3945 DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
3946 result = re.search(DOWNLOAD_LIST_RE, webpage)
3948 raise ExtractorError(u'Unable to extract download list')
3949 download_list_html = result.group('download_list').strip()
3951 # Get all of the links from the page
3952 LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
3953 links = re.findall(LINK_RE, download_list_html)
3954 if(len(links) == 0):
3955 raise ExtractorError(u'ERROR: no known formats available for video')
3957 self._downloader.to_screen(u'[youporn] Links found: %d' % len(links))
3962 # A link looks like this:
3963 # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
3964 # A path looks like this:
3965 # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
3966 video_url = unescapeHTML( link )
3967 path = compat_urllib_parse_urlparse( video_url ).path
3968 extension = os.path.splitext( path )[1][1:]
3969 format = path.split('/')[4].split('_')[:2]
3972 format = "-".join( format )
3973 title = u'%s-%s-%s' % (video_title, size, bitrate)
3978 'uploader': video_uploader,
3979 'upload_date': upload_date,
3984 'description': None,
3988 if self._downloader.params.get('listformats', None):
3989 self._print_formats(formats)
3992 req_format = self._downloader.params.get('format', None)
3993 self._downloader.to_screen(u'[youporn] Format: %s' % req_format)
3995 if req_format is None or req_format == 'best':
3997 elif req_format == 'worst':
3998 return [formats[-1]]
3999 elif req_format in ('-1', 'all'):
4002 format = self._specific( req_format, formats )
4004 self._downloader.report_error(u'requested format not available')
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        """Extract the flv URL and upload date for a pornotube video page.

        Returns a one-element list with the info dictionary, or None after
        reporting an error (matching the other extractors in this file).
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # id and title are both encoded in the URL itself
        video_id = mobj.group('videoid')
        video_title = mobj.group('title')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video URL (a direct .flv link embedded in the player config)
        VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
        result = re.search(VIDEO_URL_RE, webpage)
        if result is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(result.group('url'))

        # Get the uploaded date
        VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
        result = re.search(VIDEO_UPLOADED_RE, webpage)
        if result is None:
            # BUGFIX: this failure path previously reported
            # 'unable to extract video title', which misdescribes the
            # actual failure (the upload date is what was not found)
            self._downloader.report_error(u'unable to extract video upload date')
            return
        upload_date = result.group('date')

        info = {'id': video_id,
                'url': video_url,
                'uploader': None,
                'upload_date': upload_date,
                'title': video_title,
                'ext': 'flv',
                'format': 'flv'}

        return [info]
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        """Resolve a youjizz video page to its flv URL via the embed page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video title
        result = re.search(r'<title>(?P<title>.*)</title>', webpage)
        if result is None:
            # BUGFIX: messages no longer carry a redundant 'ERROR: ' prefix;
            # the downloader's error path already prefixes errors
            raise ExtractorError(u'unable to extract video title')
        video_title = result.group('title').strip()

        # Get the embed page, which holds the actual player configuration
        # BUGFIX: dots in the host/path are escaped so '.' cannot match
        # arbitrary characters
        result = re.search(r'https?://www\.youjizz\.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        if result is None:
            raise ExtractorError(u'unable to extract embed page')

        embed_page_url = result.group(0).strip()
        video_id = result.group('videoid')

        webpage = self._download_webpage(embed_page_url, video_id)

        # The flv URL is passed to the flash player via addVariable("file", ...)
        result = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
        if result is None:
            raise ExtractorError(u'unable to extract video url')
        video_url = result.group('source')

        info = {'id': video_id,
                'url': video_url,
                'title': video_title,
                'ext': 'flv',
                'format': 'flv',
                'player_url': embed_page_url}

        return [info]
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes.

    A mix is a playlist of songs; each song's stream URL has to be
    requested one at a time from the play/next API endpoints, so this
    extractor returns one info dictionary per track.
    """

    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        # The mix metadata is embedded in the page as a JS object literal
        mix_match = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
        if not mix_match:
            raise ExtractorError(u'Cannot find trax information')
        mix_data = json.loads(mix_match.group(1))

        # Each playback session is identified by a random client-side token
        session = str(random.randint(0, 1000000000))
        mix_id = mix_data['id']
        track_count = mix_data['tracks_count']
        first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)

        entries = []
        next_url = first_url
        track_number = 0
        while True:
            track_number += 1
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(track_number), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            entries.append({
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': mix_data['user']['login'],
                'ext': 'm4a',
            })
            # The API flags the final track; otherwise ask for the next one
            if api_data['set']['at_last_track']:
                break
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
        return entries
class KeekIE(InfoExtractor):
    """Information extractor for keek.com short videos."""

    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        """Build the CDN URLs from the video id and scrape title/uploader."""
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')

        # Video and thumbnail URLs can be derived from the id alone
        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
        webpage = self._download_webpage(url, video_id)

        m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
        if m is None:
            # BUGFIX: the result was previously used unguarded, so a page
            # layout change crashed with AttributeError instead of a clean error
            raise ExtractorError(u'unable to extract video title')
        title = unescapeHTML(m.group('title'))

        m = re.search(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>', webpage)
        if m is None:
            raise ExtractorError(u'unable to extract uploader')
        uploader = clean_html(m.group('uploader'))

        info = {
                'id': video_id,
                'url': video_url,
                'ext': 'mp4',
                'title': title,
                'thumbnail': thumbnail,
                'uploader': uploader
        }
        return [info]
class TEDIE(InfoExtractor):
    """Information extractor for ted.com talks and playlists.

    The URL decides the mode: a /talks/ URL yields a single talk, a
    /playlists/<id>/ URL is scraped for its talks, each of which is then
    extracted individually.
    """
    # Written with re.VERBOSE, hence the overridden suitable() below
    _VALID_URL=r'''http://www.ted.com/
                   (
                        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                        |
                        ((?P<type_talk>talks)) # We have a simple talk
                   )
                   (/lang/(.*?))? # The url may contain the language
                   /(?P<name>\w+) # Here goes the name and then ".html"
                   '''

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because the base class matches _VALID_URL without
        # re.VERBOSE, which this pattern requires
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        # Dispatch on URL type: single talk vs. playlist of talks
        m=re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type_talk'):
            return [self._talk_info(url)]
        else :
            playlist_id=m.group('playlist_id')
            name=m.group('name')
            self._downloader.to_screen(u'[%s] Getting info of playlist %s: "%s"' % (self.IE_NAME,playlist_id,name))
            return self._playlist_videos_info(url,name,playlist_id)

    def _talk_video_link(self,mediaSlug):
        '''Returns the video link for that mediaSlug'''
        return 'http://download.ted.com/talks/%s.mp4' % mediaSlug

    def _playlist_videos_info(self,url,name,playlist_id=0):
        '''Returns the videos of the playlist'''
        # NOTE(review): inside a character class '.' is a literal dot, so
        # ([.\s]*?) only skips dots/whitespace between attributes — appears
        # intentional for this markup; confirm before changing
        video_RE=r'''
                     <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
                     ([.\s]*?)data-playlist_item_id="(\d+)"
                     ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
                     '''
        video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        # Pair each <li> talk entry with its title link; assumes both regexes
        # match the page's talks in the same order
        m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
        m_names=re.finditer(video_name_RE,webpage)
        info=[]
        for m_video, m_name in zip(m_videos,m_names):
            video_id=m_video.group('video_id')
            talk_url='http://www.ted.com%s' % m_name.group('talk_url')
            info.append(self._talk_info(talk_url,video_id))
        return info

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        m=re.match(self._VALID_URL, url,re.VERBOSE)
        videoName=m.group('name')
        webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
        # If the url includes the language we get the title translated
        title_RE=r'<span id="altHeadline" >(?P<title>.*)</span>'
        title=re.search(title_RE, webpage).group('title')
        # id and mediaSlug live in the inline talkDetails JS object
        info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
                        "id":(?P<videoID>[\d]+).*?
                        "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
        thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
        thumb_match=re.search(thumb_RE,webpage)
        info_match=re.search(info_RE,webpage,re.VERBOSE)
        video_id=info_match.group('videoID')
        # The downloadable mp4 URL is derived from the media slug
        mediaSlug=info_match.group('mediaSlug')
        video_url=self._talk_video_link(mediaSlug)
        info = {
                'id': video_id,
                'url': video_url,
                'ext': 'mp4',
                'title': title,
                'thumbnail': thumb_match.group('thumbnail')
                }
        return info
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de (metadata comes from an XML API)."""

    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        """Fetch the video metadata XML and build the info dictionary."""
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        def _text_or_none(tag):
            # Single place for the repeated find()/None-check pattern
            el = metadata.find(tag)
            return None if el is None else el.text

        # extract values from metadata
        video_url = _text_or_none('url_flv')
        if video_url is None:
            self._downloader.report_error(u'unable to extract download url')
            return
        extension = os.path.splitext(video_url)[1][1:]

        title = _text_or_none('title')
        if title is None:
            self._downloader.report_error(u'unable to extract title')
            return

        # Prefer the explicit format id; fall back to the file extension.
        # (Renamed from 'format', which shadowed the builtin.)
        format_id = _text_or_none('format_id')
        video_format = extension if format_id is None else format_id

        # description and thumbnail are optional
        description = _text_or_none('description')
        thumbnail = _text_or_none('imagePreview')

        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': video_format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
class SpiegelIE(InfoExtractor):
    """Information extractor for spiegel.de videos.

    The page supplies the title; the actual file name and duration come
    from a per-video flash-player XML document.
    """

    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('videoID')

        webpage = self._download_webpage(url, video_id)

        title_match = re.search(r'<div class="spVideoTitle">(.*?)</div>', webpage)
        if not title_match:
            raise ExtractorError(u'Cannot find title')
        video_title = unescapeHTML(title_match.group(1))

        # The player configuration XML lists the available files
        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(xml_url, video_id,
            note=u'Downloading XML', errnote=u'Failed to download XML')

        idoc = xml.etree.ElementTree.fromstring(xml_code)
        # Use the last <type> entry in the document
        last_type = idoc[-1]
        filename = last_type.findall('./filename')[0].text
        duration = float(last_type.findall('./duration')[0].text)

        video_url = 'http://video2.spiegel.de/flash/' + filename
        video_ext = filename.rpartition('.')[2]
        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': video_title,
            'duration': duration,
        }]
class LiveLeakIE(InfoExtractor):
    """Information extractor for liveleak.com."""

    _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        """Scrape the player config for the video URL plus page metadata."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # BUGFIX: was self._downloader.trouble (deprecated); use
            # report_error like the rest of this file
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        m = re.search(r'file: "(.*?)",', webpage)
        if not m:
            self._downloader.report_error(u'unable to find video url')
            return
        video_url = m.group(1)

        m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
        if not m:
            # BUGFIX: previously only called trouble() and then fell through
            # to m.group(...) on None, crashing with AttributeError
            self._downloader.report_error(u'Cannot find video title')
            return
        title = unescapeHTML(m.group('title')).replace('LiveLeak.com -', '').strip()

        # description and uploader are optional
        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        if m:
            desc = unescapeHTML(m.group('desc'))
        else:
            desc = None

        m = re.search(r'By:.*?(\w+)</a>', webpage)
        if m:
            uploader = clean_html(m.group(1))
        else:
            uploader = None

        info = {
            'id':  video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
            'uploader': uploader
        }

        return [info]
class ARDIE(InfoExtractor):
    """Information extractor for the ARD mediathek.

    Streams are announced to the flash player via mediaCollection
    .addMediaStream(...) calls; we pick media type 0 at the highest quality.
    """

    _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
    _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
    _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'

    def _real_extract(self, url):
        # determine video id from url; prefer an explicit documentId parameter
        m = re.match(self._VALID_URL, url)
        numid = re.search(r'documentId=([0-9]+)', url)
        if numid:
            video_id = numid.group(1)
        else:
            video_id = m.group('video_id')

        # determine title and media streams from webpage
        html = self._download_webpage(url, video_id)
        title = re.search(self._TITLE, html).group('title')
        streams = [mo.groupdict() for mo in re.finditer(self._MEDIA_STREAM, html)]
        if not streams:
            # BUGFIX: was `assert '"fsk"' in html` — assert is stripped under
            # -O and raised a bare AssertionError on unexpected pages; report
            # a proper error in either case instead
            if '"fsk"' in html:
                self._downloader.report_error(u'this video is only available after 8:00 pm')
            else:
                self._downloader.report_error(u'unable to extract media streams')
            return

        # choose default media type and highest quality for now
        stream = max([s for s in streams if int(s["media_type"]) == 0],
                     key=lambda s: int(s["quality"]))

        # there's two possibilities: RTMP stream or HTTP download
        info = {'id': video_id, 'title': title, 'ext': 'mp4'}
        if stream['rtmp_url']:
            self._downloader.to_screen(u'[%s] RTMP download detected' % self.IE_NAME)
            # BUGFIX: shape checks were asserts; raise a real extractor error
            if not stream['video_url'].startswith('mp4:'):
                raise ExtractorError(u'unexpected RTMP play path: %s' % stream['video_url'])
            info["url"] = stream["rtmp_url"]
            info["play_path"] = stream['video_url']
        else:
            if not stream["video_url"].endswith('.mp4'):
                raise ExtractorError(u'unexpected video URL: %s' % stream['video_url'])
            info["url"] = stream["video_url"]
        return [info]
4418 def gen_extractors():
4419 """ Return a list of an instance of every supported extractor.
4420 The order does matter; the first extractor matched is the one handling the URL.
4423 YoutubePlaylistIE(),
4448 StanfordOpenClassroomIE(),
4458 WorldStarHipHopIE(),