2 # -*- coding: utf-8 -*-
4 from __future__ import absolute_import
15 import xml.etree.ElementTree
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    title:          Video title, unescaped.
    ext:            Video filename extension.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The subtitle file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    The fields should all be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    # NOTE(review): this copy of the file appears to be missing many lines
    # (orphaned statements, `except` clauses without `try:`, unterminated
    # dict literals). Comments below flag the suspicious spots; no code
    # tokens were altered.

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self.set_downloader(downloader)

    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # NOTE(review): classmethod-style signature (cls) but no @classmethod
        # decorator is visible here -- presumably lost with the missing lines.
        return re.match(cls._VALID_URL, url) is not None

        # NOTE(review): the two docstrings and the call below are unreachable
        # (they follow a return) and look like remnants of missing
        # working()/initialize() method definitions -- confirm upstream.
        """Getter method for _WORKING."""
        """Initializes an instance (authentication, etc)."""
        self._real_initialize()

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        # NOTE(review): this return (class name minus the trailing "IE")
        # looks like the body of an IE_NAME property whose `def` line is
        # missing -- it does not belong to _real_extract.
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the response handle """
        # NOTE(review): an `if note is None:` guard appears to be missing;
        # as written this always overwrites a caller-supplied note.
        note = u'Downloading video webpage'
        if note is not False:
            self._downloader.to_screen(u'[%s] %s: %s' % (self.IE_NAME, video_id, note))
        # NOTE(review): `except` without a matching `try:` below -- the
        # `try:` line is apparently missing.
        return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            errnote = u'Unable to download webpage'
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the data of the page as a string """
        urlh = self._request_webpage(url_or_request, video_id, note, errnote)
        content_type = urlh.headers.get('Content-Type', '')
        # Pull the declared charset out of e.g. "text/html; charset=utf-8".
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        # NOTE(review): an `if m:` guard (and an else branch defaulting the
        # encoding) appears to be missing -- this raises AttributeError when
        # no charset is declared.
        encoding = m.group(1)
        webpage_bytes = urlh.read()
        if self._downloader.params.get('dump_intermediate_pages', False):
            # NOTE(review): a `try:` line appears to be missing here (the
            # AttributeError branch handles plain-string URLs that have no
            # get_full_url method).
            url = url_or_request.get_full_url()
            except AttributeError:
            self._downloader.to_screen(u'Dumping request to ' + url)
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        return webpage_bytes.decode(encoding, 'replace')

    #Methods for following #608
    #They set the correct value of the '_type' key
    def video_result(self, video_info):
        """Returns a video"""
        video_info['_type'] = 'video'

    def url_result(self, url, ie=None):
        """Returns a url that points to a page that should be processed"""
        #TODO: ie should be the class used for getting the info
        # NOTE(review): unterminated dict literal -- the 'url'/'ie' entries
        # and the return statement are apparently missing.
        video_info = {'_type': 'url',

    def playlist_result(self, entries, playlist_id=None, playlist_title=None):
        """Returns a playlist"""
        # NOTE(review): unterminated dict literal and missing `if` guards
        # around the optional id/title assignments and the final return.
        video_info = {'_type': 'playlist',
        video_info['id'] = playlist_id
        video_info['title'] = playlist_title
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # NOTE(review): the `_VALID_URL = r'''(?x)` opener and its closing
    # quotes are missing from this copy; the verbose-regex body below is
    # left verbatim. Several other lines (try statements, else branches,
    # dict delimiters, @classmethod decorators) are also missing throughout
    # this class -- flagged where most confusing, code kept untouched.
        (?:https?://)? # http(s):// (optional)
        (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
        tube\.majestyc\.net/) # the various hostnames, with wildcard subdomains
        (?:.*?\#/)? # handle anchor (#/) redirect urls
        (?: # the various things that can precede the ID:
            (?:(?:v|embed|e)/) # v/ or embed/ or e/
            |(?: # or the v= param in all its forms
                (?:watch(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                (?:\?|\#!?) # the params delimiter ? or # or #!
                (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
            )? # optional -> youtube.com/xxxx is OK
        )? # all until now is optional -> you can pass the naked ID
        ([0-9A-Za-z_-]+) # here is it! the YouTube video ID
        (?(1).+)? # if we found the ID, everything can follow

    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # NOTE(review): both dict literals below are unterminated in this copy.
    _video_extensions = {
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
    _video_dimensions = {

    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Playlist URLs must be handled by YoutubePlaylistIE, not here.
        if YoutubePlaylistIE.suitable(url): return False
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to check for available subtitles."""
        self._downloader.to_screen(u'[youtube] %s: Checking available subtitles' % video_id)

    def report_video_subtitles_request(self, video_id, sub_lang, format):
        """Report attempt to download the video subtitles."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))

    def report_video_subtitles_available(self, video_id, sub_lang_list):
        """Report available subtitles."""
        sub_lang = ",".join(list(sub_lang_list.keys()))
        self._downloader.to_screen(u'[youtube] %s: Available subtitles for video: %s' % (video_id, sub_lang))

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report that the requested format is not available."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _get_available_subtitles(self, video_id):
        # Returns a {lang_code: name} dict, or an (error_message, None)
        # tuple on failure (callers distinguish with isinstance(..., tuple)).
        self.report_video_subtitles_download(video_id)
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
        # NOTE(review): `try:` line apparently missing before this statement.
        sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'unable to download video subtitles: %s' % compat_str(err), None)
        sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
        sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
        if not sub_lang_list:
            return (u'video doesn\'t have subtitles', None)
        # NOTE(review): the final `return sub_lang_list` appears to be missing.

    def _list_available_subtitles(self, video_id):
        # Print the available subtitle languages for --list-subs.
        sub_lang_list = self._get_available_subtitles(video_id)
        self.report_video_subtitles_available(video_id, sub_lang_list)

    def _request_subtitle(self, sub_lang, sub_name, video_id, format):
        # NOTE(review): the line below looks like the remnant of a docstring
        # ("Return tuple: (error_message, sub_lang, sub)") whose triple
        # quotes are missing; left verbatim.
        (error_message, sub_lang, sub)
        self.report_video_subtitles_request(video_id, sub_lang, format)
        # NOTE(review): unterminated urlencode({...}) call -- the query
        # parameters (lang, name, v, fmt) are apparently missing.
        params = compat_urllib_parse.urlencode({
        url = 'http://www.youtube.com/api/timedtext?' + params
        # NOTE(review): `try:` line apparently missing before this statement.
        sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
        # NOTE(review): an `if not sub:` guard appears to be missing above
        # the first return.
        return (u'Did not fetch video subtitles', None, None)
        return (None, sub_lang, sub)

    def _extract_subtitle(self, video_id):
        # NOTE(review): the two lines below look like a docstring
        # ("Return a list with a tuple: [(error_message, sub_lang, sub)]")
        # whose triple quotes are missing; left verbatim.
        Return a list with a tuple:
        [(error_message, sub_lang, sub)]
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
            return [(sub_lang_list[0], None, None)]
        if self._downloader.params.get('subtitleslang', False):
            sub_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in sub_lang_list:
            # NOTE(review): the `sub_lang = 'en'` branch body and the final
            # `else:` appear to be missing -- this line was presumably the
            # else-branch fallback (first available language).
            sub_lang = list(sub_lang_list.keys())[0]
        if not sub_lang in sub_lang_list:
            return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]
        # NOTE(review): the `return [subtitle]` appears to be missing.
        subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)

    def _extract_all_subtitles(self, video_id):
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
            return [(sub_lang_list[0], None, None)]
        # NOTE(review): the `subtitles = []` initialisation and the final
        # `return subtitles` appear to be missing.
        for sub_lang in sub_lang_list:
            subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
            subtitles.append(subtitle)

    def _print_formats(self, formats):
        print('Available formats:')
        # NOTE(review): the `for x in formats:` line appears to be missing.
        print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        # Sets language, logs in (username/password or .netrc) and confirms
        # age. NOTE(review): many lines are missing in this copy (try
        # statements, return/else branches, the login_form_strs and age_form
        # dict literals); statements are kept verbatim.
        if self._downloader is None:
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            info = netrc.netrc().authenticators(self._NETRC_MACHINE)
            raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))

        # Set language
        request = compat_urllib_request.Request(self._LANG_URL)
        compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))

        # No authentication to be performed
        request = compat_urllib_request.Request(self._LOGIN_URL)
        login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))

        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
        galx = match.group(1)
        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)

        # NOTE(review): the entries below are remnants of the
        # `login_form_strs = {` dict literal whose surrounding lines are
        # missing.
        u'continue': u'http://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
        u'PersistentCookie': u'yes',
        u'bgresponse': u'js_disabled',
        u'checkConnection': u'',
        u'checkedDomains': u'youtube',
        u'signIn': u'Sign in',
        u'service': u'youtube',

        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
            self._downloader.report_warning(u'unable to log in: bad username or password')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))

        # Confirm age
        # NOTE(review): remnant of the `age_form = {` dict literal.
        'action_confirm': 'Confirm',
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        self.report_age_confirmation()
        age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))

    def _extract_id(self, url):
        # Pull the video ID (group 2) out of a YouTube URL.
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        # NOTE(review): the `if mobj is None:` guard and the final
        # `return video_id` appear to be missing.
        self._downloader.report_error(u'invalid URL: %s' % url)
        video_id = mobj.group(2)

    def _real_extract(self, url):
        # NOTE(review): many guard/else/try lines and the results-dict
        # assembly are missing in this copy; statements kept verbatim.
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
        video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        # Un-escape the JSON-escaped URL (\\/ -> /).
        player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))

        # Get video info: try several &el= variants until one returns a token
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                % (video_id, el_type))
            video_info_webpage = self._download_webpage(video_info_url, video_id,
                errnote='unable to download video info webpage')
            video_info = compat_parse_qs(video_info_webpage)
            # NOTE(review): the `break` body of this guard appears missing.
            if 'token' in video_info:
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.report_error(u'YouTube said: %s' % video_info['reason'][0])
                self._downloader.report_error(u'"token" parameter not in video info for unknown reason')

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.report_error(u'"rental" videos not supported')

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.report_error(u'unable to extract uploader name')
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        video_uploader_id = mobj.group(1)
        self._downloader.report_warning(u'unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            self._downloader.report_error(u'unable to extract video title')
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.report_warning(u'unable to extract video thumbnail')
        else: # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
        format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
        for expression in format_expressions:
            # NOTE(review): a try/except around strptime (to skip formats
            # that do not match) appears to be missing.
            upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
            # NOTE(review): presumably an `else:` branch originally.
            video_description = ''

        # subtitles
        video_subtitles = None

        if self._downloader.params.get('writesubtitles', False):
            video_subtitles = self._extract_subtitle(video_id)
            (sub_error, sub_lang, sub) = video_subtitles[0]
            # NOTE(review): an `if sub_error:` guard appears to be missing.
            self._downloader.report_error(sub_error)

        if self._downloader.params.get('allsubtitles', False):
            video_subtitles = self._extract_all_subtitles(video_id)
            for video_subtitle in video_subtitles:
                (sub_error, sub_lang, sub) = video_subtitle
                self._downloader.report_error(sub_error)

        if self._downloader.params.get('listsubtitles', False):
            sub_lang_list = self._list_available_subtitles(video_id)

        if 'length_seconds' not in video_info:
            self._downloader.report_warning(u'unable to extract video duration')
        video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
                # NOTE(review): presumably an `else:` branch originally.
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.report_error(u'no known formats available for video')
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
                # Specific formats. We pick the first in a slash-delimited sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    # NOTE(review): an `if rf in url_map:` guard and `break`
                    # appear to be missing here.
                    video_url_list = [(rf, url_map[rf])]
                if video_url_list is None:
                    self._downloader.report_error(u'requested format not available')
            # NOTE(review): presumably inside a final `else:` originally.
            self._downloader.report_error(u'no conn or url_encoded_fmt_stream_map information found in video info')

        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                self._video_dimensions.get(format_param, '???'))

            # NOTE(review): the entries below are remnants of the
            # `results.append({ ... })` info-dict literal whose surrounding
            # lines (including the 'id' entry and the final return) are
            # missing.
                'url': video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date': upload_date,
                'title': video_title,
                'ext': video_extension,
                'format': video_format,
                'thumbnail': video_thumbnail,
                'description': video_description,
                'player_url': player_url,
                'subtitles': video_subtitles,
                'duration': video_duration
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    # NOTE(review): this copy is missing lines throughout (guards, try
    # statements, dict literals); comments flag the gaps, code kept verbatim.
    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        self.report_disclaimer()
        # NOTE(review): `try:` line apparently missing before this statement.
        disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to retrieve disclaimer: %s' % compat_str(err))

        # Confirm age
        # NOTE(review): remnant of the `disclaimer_form = {` dict literal.
        'submit': "Continue - I'm over 18",
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
        self.report_age_confirmation()
        disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))

    def _real_extract(self, url):
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): the `if mobj is None:` guard appears to be missing.
        self._downloader.report_error(u'invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1), 'Youtube')]

        # Retrieve video webpage to extract further information
        webpage = self._download_webpage('http://www.metacafe.com/watch/%s/' % video_id, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        # NOTE(review): an `if mobj is not None:` guard appears missing here.
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))
        video_extension = mediaURL[-3:]

        # Extract gdaKey if available
        mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
        gdaKey = mobj.group(1)
        video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)

        # Fallback: flashvars-based extraction (originally the else branch).
        mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
        # NOTE(review): guard lines missing around the error reports below.
        self._downloader.report_error(u'unable to extract media URL')

        vardict = compat_parse_qs(mobj.group(1))
        if 'mediaData' not in vardict:
            self._downloader.report_error(u'unable to extract media URL')
        mobj = re.search(r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
        self._downloader.report_error(u'unable to extract media URL')
        # Un-escape JSON slashes in the media URL.
        mediaURL = mobj.group('mediaURL').replace('\\/', '/')
        video_extension = mediaURL[-3:]
        video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key'))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        self._downloader.report_error(u'unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'submitter=(.*?);', webpage)
        self._downloader.report_error(u'unable to extract uploader nickname')
        video_uploader = mobj.group(1)

        # NOTE(review): remnants of the returned info-dict literal; the
        # surrounding `return [{ ... }]` lines are missing.
        'id': video_id.decode('utf-8'),
        'url': video_url.decode('utf-8'),
        'uploader': video_uploader.decode('utf-8'),
        'title': video_title,
        'ext': video_extension.decode('utf-8'),
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    # NOTE(review): this copy is missing lines throughout (guards, break
    # statements, the returned info-dict literal); code kept verbatim.
    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): the `if mobj is None:` guard appears to be missing.
        self._downloader.report_error(u'invalid URL: %s' % url)

        video_id = mobj.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        # Disable family filtering so age-restricted pages still load.
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'\s*var flashvars = (.*)', webpage)
        self._downloader.report_error(u'unable to extract media URL')
        flashvars = compat_urllib_parse.unquote(mobj.group(1))

        # Pick the best available quality, highest first.
        # NOTE(review): the `if key in flashvars:` / `max_quality = key` /
        # `break` lines appear to be missing inside this loop.
        for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
            self._downloader.to_screen(u'[dailymotion] Using %s' % key)
            self._downloader.report_error(u'unable to extract video URL')

        mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        self._downloader.report_error(u'unable to extract video URL')

        video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        self._downloader.report_error(u'unable to extract title')
        video_title = unescapeHTML(mobj.group('title'))

        video_uploader = None
        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        # looking for official user
        mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
        if mobj_official is None:
            self._downloader.report_warning(u'unable to extract uploader nickname')
            # NOTE(review): presumably under an `else:` originally.
            video_uploader = mobj_official.group(1)
        video_uploader = mobj.group(1)

        video_upload_date = None
        mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        # Rearrange DD-MM-YYYY into YYYYMMDD.
        video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)

        # NOTE(review): remnants of the returned info-dict literal; the
        # surrounding `return [{ ... }]` lines (incl. 'id'/'url') are missing.
        'uploader': video_uploader,
        'upload_date': video_upload_date,
        'title': video_title,
        'ext': video_extension,
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    # NOTE(review): this copy is missing lines (guards, try statements, the
    # returned info-dict literal); code kept verbatim.
    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): the `if mobj is None:` guard appears to be missing.
        self._downloader.report_error(u'Invalid URL: %s' % url)

        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        self.report_download_webpage(video_id)
        # NOTE(review): `try:` line apparently missing before this statement.
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        self._downloader.report_error(u'unable to extract media URL')
        # NOTE(review): the `video_url = mediaURL` assignment appears to be
        # missing after this line.
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))

        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        self._downloader.report_error(u'unable to extract title')
        video_title = mobj.group(1).decode('utf-8')
        video_uploader = mobj.group(2).decode('utf-8')

        # NOTE(review): remnants of the returned info-dict literal; the
        # surrounding `return [{ ... }]` lines are missing.
        'id': video_id.decode('utf-8'),
        'url': video_url.decode('utf-8'),
        'uploader': video_uploader,
        'title': video_title,
        'ext': video_extension.decode('utf-8'),
# YahooIE: extracts video metadata and the final .flv media URL from
# video.yahoo.com pages by scraping <meta> tags and a playlist-XML endpoint.
# NOTE(review): this excerpt has elided lines (the `try:`, `if mobj is None:`
# and `return` statements that normally surround these calls are missing, and
# leading indentation is stripped) — confirm control flow against the full file.
944 class YahooIE(InfoExtractor):
945 """Information extractor for video.yahoo.com."""
948 # _VALID_URL matches all Yahoo! Video URLs
949 # _VPAGE_URL matches only the extractable '/watch/' URLs
950 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
951 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
952 IE_NAME = u'video.yahoo'
# Constructor: just delegates to the base class, storing the downloader.
954 def __init__(self, downloader=None):
955 InfoExtractor.__init__(self, downloader)
# Progress reporting helpers; both write to the downloader's screen log.
957 def report_download_webpage(self, video_id):
958 """Report webpage download."""
959 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
961 def report_extraction(self, video_id):
962 """Report information extraction."""
963 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
# Main entry point. `new_video=False` marks the single recursive retry made
# after rewriting a non-/watch/ URL into the canonical /watch/ form.
965 def _real_extract(self, url, new_video=True):
966 # Extract ID from URL
967 mobj = re.match(self._VALID_URL, url)
969 self._downloader.report_error(u'Invalid URL: %s' % url)
# Second capture group of _VALID_URL is the video id; extension is fixed.
972 video_id = mobj.group(2)
973 video_extension = 'flv'
975 # Rewrite valid but non-extractable URLs as
976 # extractable English language /watch/ URLs
977 if re.match(self._VPAGE_URL, url) is None:
978 request = compat_urllib_request.Request(url)
980 webpage = compat_urllib_request.urlopen(request).read()
981 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
982 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
# Pull the page's internal ("id", "...") / ("vid", "...") JS assignments
# to rebuild the canonical watch URL, then recurse exactly once.
985 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
987 self._downloader.report_error(u'Unable to extract id field')
989 yahoo_id = mobj.group(1)
991 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
993 self._downloader.report_error(u'Unable to extract vid field')
995 yahoo_vid = mobj.group(1)
997 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
998 return self._real_extract(url, new_video=False)
1000 # Retrieve video webpage to extract further information
1001 request = compat_urllib_request.Request(url)
1003 self.report_download_webpage(video_id)
1004 webpage = compat_urllib_request.urlopen(request).read()
1005 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1006 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
1009 # Extract uploader and title from webpage
1010 self.report_extraction(video_id)
1011 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1013 self._downloader.report_error(u'unable to extract video title')
# .decode('utf-8') on match groups: Python 2 bytes -> unicode conversion.
1015 video_title = mobj.group(1).decode('utf-8')
1017 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1019 self._downloader.report_error(u'unable to extract video uploader')
# NOTE(review): group(1) here is the (people|profile) alternation, not the
# uploader name in group(2) — looks like a latent bug; verify upstream.
1021 video_uploader = mobj.group(1).decode('utf-8')
1023 # Extract video thumbnail
1024 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1026 self._downloader.report_error(u'unable to extract video thumbnail')
1028 video_thumbnail = mobj.group(1).decode('utf-8')
1030 # Extract video description
1031 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1033 self._downloader.report_error(u'unable to extract video description')
1035 video_description = mobj.group(1).decode('utf-8')
1036 if not video_description:
1037 video_description = 'No description available.'
1039 # Extract video height and width
# Height/width feed the playlist query below; server needs them (see 401 note).
1040 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1042 self._downloader.report_error(u'unable to extract video height')
1044 yv_video_height = mobj.group(1)
1046 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1048 self._downloader.report_error(u'unable to extract video width')
1050 yv_video_width = mobj.group(1)
1052 # Retrieve video playlist to extract media URL
1053 # I'm not completely sure what all these options are, but we
1054 # seem to need most of them, otherwise the server sends a 401.
1055 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
1056 yv_bitrate = '700' # according to Wikipedia this is hard-coded
1057 request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1058 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1059 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1061 self.report_download_webpage(video_id)
1062 webpage = compat_urllib_request.urlopen(request).read()
1063 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1064 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
1067 # Extract media URL from playlist XML
1068 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1070 self._downloader.report_error(u'Unable to extract media URL')
# APP + FULLPATH concatenate to the real media URL; unquote then unescape HTML.
1072 video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1073 video_url = unescapeHTML(video_url)
# Result dict (surrounding list/return elided in this excerpt).
1076 'id': video_id.decode('utf-8'),
1078 'uploader': video_uploader,
1079 'upload_date': None,
1080 'title': video_title,
1081 'ext': video_extension.decode('utf-8'),
1082 'thumbnail': video_thumbnail.decode('utf-8'),
1083 'description': video_description,
# VimeoIE: extracts video info from vimeo.com by parsing the embedded
# player config JSON and building a play_redirect URL from its
# signature/timestamp plus the best available codec/quality pair.
# NOTE(review): this excerpt has elided lines (`try:`, `if mobj is None:`,
# `return`, `break`/`else` of loops) and stripped indentation — confirm
# control flow against the full original file.
1087 class VimeoIE(InfoExtractor):
1088 """Information extractor for vimeo.com."""
1090 # _VALID_URL matches Vimeo URLs
# Named groups: proto (scheme present?), direct_link (player redirect form), id.
1091 _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
1094 def __init__(self, downloader=None):
1095 InfoExtractor.__init__(self, downloader)
# Progress reporting helpers.
1097 def report_download_webpage(self, video_id):
1098 """Report webpage download."""
1099 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
1101 def report_extraction(self, video_id):
1102 """Report information extraction."""
1103 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
1105 def _real_extract(self, url, new_video=True):
1106 # Extract ID from URL
1107 mobj = re.match(self._VALID_URL, url)
1109 self._downloader.report_error(u'Invalid URL: %s' % url)
1112 video_id = mobj.group('id')
# Normalize: force https scheme; rewrite player redirect URLs to the
# canonical video page so the config JSON is present in the HTML.
1113 if not mobj.group('proto'):
1114 url = 'https://' + url
1115 if mobj.group('direct_link'):
1116 url = 'https://vimeo.com/' + video_id
1118 # Retrieve video webpage to extract further information
1119 request = compat_urllib_request.Request(url, None, std_headers)
1121 self.report_download_webpage(video_id)
1122 webpage_bytes = compat_urllib_request.urlopen(request).read()
1123 webpage = webpage_bytes.decode('utf-8')
1124 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1125 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
1128 # Now we begin extracting as much information as we can from what we
1129 # retrieved. First we extract the information common to all extractors,
1130 # and latter we extract those that are Vimeo specific.
1131 self.report_extraction(video_id)
1133 # Extract the config JSON
# Crude but effective: slice the JS between ' = {config:' and ',assets:'
# out of the page and parse it as JSON.
1135 config = webpage.split(' = {config:')[1].split(',assets:')[0]
1136 config = json.loads(config)
1138 self._downloader.report_error(u'unable to extract info section')
# Extract title
1142 video_title = config["video"]["title"]
1144 # Extract uploader and uploader_id
# uploader_id is the last path segment of the owner's profile URL.
1145 video_uploader = config["video"]["owner"]["name"]
1146 video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]
1148 # Extract video thumbnail
1149 video_thumbnail = config["video"]["thumbnail"]
1151 # Extract video description
1152 video_description = get_element_by_attribute("itemprop", "description", webpage)
1153 if video_description: video_description = clean_html(video_description)
1154 else: video_description = u''
1156 # Extract upload date
# Converts ISO date "YYYY-MM-DD" from the dateCreated meta tag to YYYYMMDD.
1157 video_upload_date = None
1158 mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
1159 if mobj is not None:
1160 video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)
1162 # Vimeo specific: extract request signature and timestamp
# sig + timestamp authenticate the play_redirect request built below.
1163 sig = config['request']['signature']
1164 timestamp = config['request']['timestamp']
1166 # Vimeo specific: extract video codec and quality information
1167 # First consider quality, then codecs, then take everything
1168 # TODO bind to format param
# Candidate (codec, container-extension) pairs in order of preference.
1169 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1170 files = { 'hd': [], 'sd': [], 'other': []}
1171 for codec_name, codec_extension in codecs:
1172 if codec_name in config["video"]["files"]:
1173 if 'hd' in config["video"]["files"][codec_name]:
1174 files['hd'].append((codec_name, codec_extension, 'hd'))
1175 elif 'sd' in config["video"]["files"][codec_name]:
1176 files['sd'].append((codec_name, codec_extension, 'sd'))
# Fallback bucket: first advertised quality for this codec, whatever it is.
1178 files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))
# Pick the first non-empty bucket in hd > sd > other order.
1180 for quality in ('hd', 'sd', 'other'):
1181 if len(files[quality]) > 0:
1182 video_quality = files[quality][0][2]
1183 video_codec = files[quality][0][0]
1184 video_extension = files[quality][0][1]
1185 self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
1188 self._downloader.report_error(u'no known codec found')
# Final media URL built from id, signature, timestamp and chosen codec/quality.
1191 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1192 %(video_id, sig, timestamp, video_quality, video_codec.upper())
# Result dict (surrounding list/return elided in this excerpt).
1197 'uploader': video_uploader,
1198 'uploader_id': video_uploader_id,
1199 'upload_date': video_upload_date,
1200 'title': video_title,
1201 'ext': video_extension,
1202 'thumbnail': video_thumbnail,
1203 'description': video_description,
# ArteTvIE: extractor for videos.arte.tv (fr/de). Distinguishes live-stream
# pages (URL matching _LIVE_URL) from "Plus 7" catch-up pages and walks a
# chain of intermediate pages/XML via regex scraping (grep_webpage).
# NOTE(review): excerpt has elided lines (`try:`/`return`/`if mobj is None:`)
# and stripped indentation — confirm control flow against the full file.
1207 class ArteTvIE(InfoExtractor):
1208 """arte.tv information extractor."""
1210 _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
# Live pages are recognized by an index-<n>.html suffix on the last segment.
1211 _LIVE_URL = r'index-[0-9]+\.html$'
1213 IE_NAME = u'arte.tv'
1215 def __init__(self, downloader=None):
1216 InfoExtractor.__init__(self, downloader)
# Progress reporting helpers.
1218 def report_download_webpage(self, video_id):
1219 """Report webpage download."""
1220 self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)
1222 def report_extraction(self, video_id):
1223 """Report information extraction."""
1224 self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)
# Download helper: fetch `url` and return the raw page body, reporting
# network errors through the downloader.
1226 def fetch_webpage(self, url):
1227 request = compat_urllib_request.Request(url)
1229 self.report_download_webpage(url)
1230 webpage = compat_urllib_request.urlopen(request).read()
1231 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1232 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
1234 except ValueError as err:
1235 self._downloader.report_error(u'Invalid URL: %s' % url)
# Scrape helper: fetch `url`, apply `regex` with `regexFlags`, and build a
# dict from matchTuples = [(group_index, dict_key, error_message), ...].
1239 def grep_webpage(self, url, regex, regexFlags, matchTuples):
1240 page = self.fetch_webpage(url)
1241 mobj = re.search(regex, page, regexFlags)
1245 self._downloader.report_error(u'Invalid URL: %s' % url)
1248 for (i, key, err) in matchTuples:
1249 if mobj.group(i) is None:
# Legacy error path: trouble() with the per-group message from the caller.
1250 self._downloader.trouble(err)
1253 info[key] = mobj.group(i)
# Live-stream path: locate the videothek JS, then the geo-gated stream path
# and SWF player inside it, and assemble the final stream URL.
1257 def extractLiveStream(self, url):
# Language code is the 4th path segment from the end of the live URL.
1258 video_lang = url.split('/')[-4]
1259 info = self.grep_webpage(
1261 r'src="(.*?/videothek_js.*?\.js)',
1264 (1, 'url', u'ERROR: Invalid URL: %s' % url)
1267 http_host = url.split('/')[2]
1268 next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
1269 info = self.grep_webpage(
1271 r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
1272 '(http://.*?\.swf).*?' +
1276 (1, 'path', u'ERROR: could not extract video path: %s' % url),
1277 (2, 'player', u'ERROR: could not extract video player: %s' % url),
1278 (3, 'url', u'ERROR: could not extract video url: %s' % url)
1281 video_url = u'%s/%s' % (info.get('url'), info.get('path'))
# Catch-up ("Plus 7") path: follow videorefFileUrl -> language-specific
# <video ref=...> -> final XML carrying id/name/date and the HD url.
1283 def extractPlus7Stream(self, url):
# Language code is the 3rd path segment from the end for Plus-7 URLs.
1284 video_lang = url.split('/')[-3]
1285 info = self.grep_webpage(
1287 r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
1290 (1, 'url', u'ERROR: Invalid URL: %s' % url)
1293 next_url = compat_urllib_parse.unquote(info.get('url'))
1294 info = self.grep_webpage(
1296 r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
1299 (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
1302 next_url = compat_urllib_parse.unquote(info.get('url'))
1304 info = self.grep_webpage(
1306 r'<video id="(.*?)".*?>.*?' +
1307 '<name>(.*?)</name>.*?' +
1308 '<dateVideo>(.*?)</dateVideo>.*?' +
1309 '<url quality="hd">(.*?)</url>',
1312 (1, 'id', u'ERROR: could not extract video id: %s' % url),
1313 (2, 'title', u'ERROR: could not extract video title: %s' % url),
1314 (3, 'date', u'ERROR: could not extract video date: %s' % url),
1315 (4, 'url', u'ERROR: could not extract video url: %s' % url)
# Result dict (surrounding return elided in this excerpt).
1320 'id': info.get('id'),
1321 'url': compat_urllib_parse.unquote(info.get('url')),
1322 'uploader': u'arte.tv',
1323 'upload_date': info.get('date'),
1324 'title': info.get('title').decode('utf-8'),
# Entry point: dispatch on live vs Plus-7 URL shape.
1330 def _real_extract(self, url):
1331 video_id = url.split('/')[-1]
1332 self.report_extraction(video_id)
1334 if re.search(self._LIVE_URL, video_id) is not None:
# NOTE(review): live result is not returned here in the visible code;
# the full file may raise/return after this call — confirm.
1335 self.extractLiveStream(url)
1338 info = self.extractPlus7Stream(url)
# GenericIE: last-resort extractor. First follows HTTP redirects (HEAD with
# GET fallback) to unshorten URLs, then scans the page for common embedded
# player patterns (JW Player flashvars, file=/source= params) to find a
# direct media URL.
# NOTE(review): excerpt has elided lines (`return`s, `if mobj is None:`
# guards, HEAD method body) and stripped indentation — confirm against the
# full file.
1343 class GenericIE(InfoExtractor):
1344 """Generic last-resort information extractor."""
1347 IE_NAME = u'generic'
1349 def __init__(self, downloader=None):
1350 InfoExtractor.__init__(self, downloader)
1352 def report_download_webpage(self, video_id):
1353 """Report webpage download."""
# Warn about the generic fallback except during tests.
1354 if not self._downloader.params.get('test', False):
1355 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1356 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1358 def report_extraction(self, video_id):
1359 """Report information extraction."""
1360 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1362 def report_following_redirect(self, new_url):
1363 """Report information extraction."""
1364 self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1366 def _test_redirect(self, url):
1367 """Check if it is a redirect, like url shorteners, in case return the new url."""
# Request subclass forcing the HEAD method (body elided in this excerpt,
# presumably `return "HEAD"`).
1368 class HeadRequest(compat_urllib_request.Request):
1369 def get_method(self):
1372 class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
1374 Subclass the HTTPRedirectHandler to make it use our
1375 HeadRequest also on the redirected URL
1377 def redirect_request(self, req, fp, code, msg, headers, newurl):
1378 if code in (301, 302, 303, 307):
# Spaces are illegal in URLs; percent-encode before re-requesting.
1379 newurl = newurl.replace(' ', '%20')
# Drop body-describing headers: a HEAD/GET retry carries no body.
1380 newheaders = dict((k,v) for k,v in req.headers.items()
1381 if k.lower() not in ("content-length", "content-type"))
1382 return HeadRequest(newurl,
1384 origin_req_host=req.get_origin_req_host(),
# Non-redirect codes propagate as HTTPError.
1387 raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)
1389 class HTTPMethodFallback(compat_urllib_request.BaseHandler):
1391 Fallback to GET if HEAD is not allowed (405 HTTP error)
1393 def http_error_405(self, req, fp, code, msg, headers):
1397 newheaders = dict((k,v) for k,v in req.headers.items()
1398 if k.lower() not in ("content-length", "content-type"))
# Re-issue the same URL as a plain (GET) Request through the parent opener.
1399 return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
1401 origin_req_host=req.get_origin_req_host(),
# Build a bare opener with exactly the handlers needed for the HEAD probe.
1405 opener = compat_urllib_request.OpenerDirector()
1406 for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
1407 HTTPMethodFallback, HEADRedirectHandler,
1408 compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
1409 opener.add_handler(handler())
1411 response = opener.open(HeadRequest(url))
1412 new_url = response.geturl()
# (comparison of new_url vs url elided) — report and return the target.
1417 self.report_following_redirect(new_url)
1420 def _real_extract(self, url):
# Unshorten first; if a redirect target exists, delegate via url_result.
1421 new_url = self._test_redirect(url)
1422 if new_url: return [self.url_result(new_url)]
1424 video_id = url.split('/')[-1]
1426 webpage = self._download_webpage(url, video_id)
1427 except ValueError as err:
1428 # since this is the last-resort InfoExtractor, if
1429 # this error is thrown, it'll be thrown here
1430 self._downloader.report_error(u'Invalid URL: %s' % url)
1433 self.report_extraction(video_id)
1434 # Start with something easy: JW Player in SWFObject
1435 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1437 # Broaden the search a little bit
1438 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1440 # Broaden the search a little bit: JWPlayer JS loader
1441 mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
1443 self._downloader.report_error(u'Invalid URL: %s' % url)
1446 # It's possible that one of the regexes
1447 # matched, but returned an empty group:
1448 if mobj.group(1) is None:
1449 self._downloader.report_error(u'Invalid URL: %s' % url)
1452 video_url = compat_urllib_parse.unquote(mobj.group(1))
1453 video_id = os.path.basename(video_url)
1455 # here's a fun little line of code for you:
# Split "name.ext" into extension (without the dot) and bare id.
1456 video_extension = os.path.splitext(video_id)[1][1:]
1457 video_id = os.path.splitext(video_id)[0]
1459 # it's tempting to parse this further, but you would
1460 # have to take into account all the variations like
1461 # Video Title - Site Name
1462 # Site Name | Video Title
1463 # Video Title - Tagline | Site Name
1464 # and so on and so forth; it's just not practical
1465 mobj = re.search(r'<title>(.*)</title>', webpage)
1467 self._downloader.report_error(u'unable to extract title')
1469 video_title = mobj.group(1)
1471 # video uploader is domain name
1472 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
# NOTE(review): message says "title" but this guards the uploader/domain
# match — likely a copy-paste slip; confirm upstream.
1474 self._downloader.report_error(u'unable to extract title')
1476 video_uploader = mobj.group(1)
# Result dict (surrounding list/return elided in this excerpt).
1481 'uploader': video_uploader,
1482 'upload_date': None,
1483 'title': video_title,
1484 'ext': video_extension,
# YoutubeSearchIE: handles "ytsearch[N|all]:<query>" pseudo-URLs by paging
# through the GData API (50 results per page) and queueing each hit's watch
# URL on the downloader.
# NOTE(review): excerpt has elided lines (`if mobj is None:`, `try:`,
# `return`s, loop setup) and stripped indentation — confirm against the
# full file.
1488 class YoutubeSearchIE(InfoExtractor):
1489 """Information Extractor for YouTube search queries."""
1490 _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
1491 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1492 _max_youtube_results = 1000
1493 IE_NAME = u'youtube:search'
1495 def __init__(self, downloader=None):
1496 InfoExtractor.__init__(self, downloader)
1498 def report_download_page(self, query, pagenum):
1499 """Report attempt to download search page with given number."""
# Decode the byte query with the locale's preferred encoding for display.
1500 query = query.decode(preferredencoding())
1501 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1503 def _real_extract(self, query):
1504 mobj = re.match(self._VALID_URL, query)
1506 self._downloader.report_error(u'invalid search query "%s"' % query)
# Split "ytsearchN:terms" into prefix ("", digits, or "all") and terms.
1509 prefix, query = query.split(':')
1511 query = query.encode('utf-8')
# Empty prefix -> single result; "all" -> cap; digits -> parsed below.
1513 self._download_n_results(query, 1)
1515 elif prefix == 'all':
1516 self._download_n_results(query, self._max_youtube_results)
# (numeric-prefix branch; int(prefix) parse elided in this excerpt)
1522 self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
1524 elif n > self._max_youtube_results:
1525 self._downloader.report_warning(u'ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1526 n = self._max_youtube_results
1527 self._download_n_results(query, n)
1529 except ValueError: # parsing prefix as integer fails
1530 self._download_n_results(query, 1)
1533 def _download_n_results(self, query, n):
1534 """Downloads a specified number of results for a query"""
# Page until 50*pagenum reaches the (API-reported, n-capped) limit.
1540 while (50 * pagenum) < limit:
1541 self.report_download_page(query, pagenum+1)
# start-index is 1-based in the GData API.
1542 result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
1543 request = compat_urllib_request.Request(result_url)
1545 data = compat_urllib_request.urlopen(request).read().decode('utf-8')
1546 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1547 self._downloader.report_error(u'unable to download API page: %s' % compat_str(err))
1549 api_response = json.loads(data)['data']
1551 if not 'items' in api_response:
1552 self._downloader.trouble(u'[youtube] No video results')
1555 new_ids = list(video['id'] for video in api_response['items'])
1556 video_ids += new_ids
# Never request more than the API says exist.
1558 limit = min(n, api_response['totalItems'])
1561 if len(video_ids) > n:
1562 video_ids = video_ids[:n]
# Queue each found video as its own download.
1563 for id in video_ids:
1564 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
# GoogleSearchIE: handles "gvsearch[N|all]:<query>" pseudo-URLs by scraping
# Google Video search result pages (10 per page) and queueing each
# videoplay URL on the downloader.
# NOTE(review): excerpt has elided lines (`if mobj is None:`, `try:`,
# `return`s, loop setup) and stripped indentation — confirm against the
# full file.
1568 class GoogleSearchIE(InfoExtractor):
1569 """Information Extractor for Google Video search queries."""
1570 _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
1571 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
# Regex for one result link; group(1) is the docid.
1572 _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
# Presence of the "next page" control means more results exist.
1573 _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
1574 _max_google_results = 1000
1575 IE_NAME = u'video.google:search'
1577 def __init__(self, downloader=None):
1578 InfoExtractor.__init__(self, downloader)
1580 def report_download_page(self, query, pagenum):
1581 """Report attempt to download playlist page with given number."""
1582 query = query.decode(preferredencoding())
1583 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1585 def _real_extract(self, query):
1586 mobj = re.match(self._VALID_URL, query)
1588 self._downloader.report_error(u'invalid search query "%s"' % query)
# Same prefix convention as the other search IEs: "", "all", or a number.
1591 prefix, query = query.split(':')
1593 query = query.encode('utf-8')
1595 self._download_n_results(query, 1)
1597 elif prefix == 'all':
1598 self._download_n_results(query, self._max_google_results)
# (numeric-prefix branch; int(prefix) parse elided in this excerpt)
1604 self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
1606 elif n > self._max_google_results:
1607 self._downloader.report_warning(u'gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1608 n = self._max_google_results
1609 self._download_n_results(query, n)
1611 except ValueError: # parsing prefix as integer fails
1612 self._download_n_results(query, 1)
1615 def _download_n_results(self, query, n):
1616 """Downloads a specified number of results for a query"""
1622 self.report_download_page(query, pagenum)
# `start` is a 0-based result offset, 10 results per page.
1623 result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
1624 request = compat_urllib_request.Request(result_url)
1626 page = compat_urllib_request.urlopen(request).read()
1627 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1628 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
1631 # Extract video identifiers
1632 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1633 video_id = mobj.group(1)
1634 if video_id not in video_ids:
1635 video_ids.append(video_id)
1636 if len(video_ids) == n:
1637 # Specified n videos reached
1638 for id in video_ids:
1639 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
# No next-page control -> flush everything collected so far.
1642 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1643 for id in video_ids:
1644 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1647 pagenum = pagenum + 1
# YahooSearchIE: handles "yvsearch[N|all]:<query>" pseudo-URLs by scraping
# Yahoo! Video search result pages and queueing each watch URL on the
# downloader. Structurally parallel to GoogleSearchIE, but dedupes with an
# explicit `already_seen` set.
# NOTE(review): excerpt has elided lines (`if mobj is None:`, `try:`,
# `return`s, loop setup) and stripped indentation — confirm against the
# full file.
1650 class YahooSearchIE(InfoExtractor):
1651 """Information Extractor for Yahoo! Video search queries."""
1654 _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
1655 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
# group(1) is the "<uploader_id>/<video_id>" watch path.
1656 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1657 _MORE_PAGES_INDICATOR = r'\s*Next'
1658 _max_yahoo_results = 1000
1659 IE_NAME = u'video.yahoo:search'
1661 def __init__(self, downloader=None):
1662 InfoExtractor.__init__(self, downloader)
1664 def report_download_page(self, query, pagenum):
1665 """Report attempt to download playlist page with given number."""
1666 query = query.decode(preferredencoding())
1667 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1669 def _real_extract(self, query):
1670 mobj = re.match(self._VALID_URL, query)
1672 self._downloader.report_error(u'invalid search query "%s"' % query)
# Same prefix convention as the other search IEs: "", "all", or a number.
1675 prefix, query = query.split(':')
1677 query = query.encode('utf-8')
1679 self._download_n_results(query, 1)
1681 elif prefix == 'all':
1682 self._download_n_results(query, self._max_yahoo_results)
# (numeric-prefix branch; int(prefix) parse elided in this excerpt)
1688 self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
1690 elif n > self._max_yahoo_results:
1691 self._downloader.report_warning(u'yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1692 n = self._max_yahoo_results
1693 self._download_n_results(query, n)
1695 except ValueError: # parsing prefix as integer fails
1696 self._download_n_results(query, 1)
1699 def _download_n_results(self, query, n):
1700 """Downloads a specified number of results for a query"""
# Dedupe across pages with a set; ordering preserved in video_ids list.
1703 already_seen = set()
1707 self.report_download_page(query, pagenum)
1708 result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
1709 request = compat_urllib_request.Request(result_url)
1711 page = compat_urllib_request.urlopen(request).read()
1712 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1713 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
1716 # Extract video identifiers
1717 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1718 video_id = mobj.group(1)
1719 if video_id not in already_seen:
1720 video_ids.append(video_id)
1721 already_seen.add(video_id)
1722 if len(video_ids) == n:
1723 # Specified n videos reached
1724 for id in video_ids:
1725 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
# No "Next" link -> flush everything collected so far.
1728 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1729 for id in video_ids:
1730 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1733 pagenum = pagenum + 1
# YoutubePlaylistIE: resolves YouTube playlist/course/artist/user-upload
# URLs (and bare PL/EC/UU ids) to their video list via the GData playlists
# API, returning a playlist_result of url_results sorted by position.
# NOTE(review): excerpt has elided lines (`if mobj is None:`/`try:`/
# `return`/`break`, loop setup, _MAX_RESULTS definition) and stripped
# indentation — confirm against the full file.
1736 class YoutubePlaylistIE(InfoExtractor):
1737 """Information Extractor for YouTube playlists."""
# Verbose regex: matches playlist-bearing page URLs (group 1) or a bare
# PL/EC/UU playlist id (group 2).
1739 _VALID_URL = r"""(?:
1744 (?:course|view_play_list|my_playlists|artist|playlist|watch)
1745 \? (?:.*?&)*? (?:p|a|list)=
1748 ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
1751 ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
1753 _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
1755 IE_NAME = u'youtube:playlist'
1757 def __init__(self, downloader=None):
1758 InfoExtractor.__init__(self, downloader)
# Overrides the base suitable(): _VALID_URL is written in re.VERBOSE mode,
# so the flag must be supplied here.
1761 def suitable(cls, url):
1762 """Receives a URL and returns True if suitable for this IE."""
1763 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
1765 def report_download_page(self, playlist_id, pagenum):
1766 """Report attempt to download playlist page with given number."""
1767 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1769 def _real_extract(self, url):
1770 # Extract playlist id
1771 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
1773 self._downloader.report_error(u'invalid url: %s' % url)
1776 # Download playlist videos from API
# Either the URL-embedded id (group 1) or the bare id (group 2) matched.
1777 playlist_id = mobj.group(1) or mobj.group(2)
1782 self.report_download_page(playlist_id, page_num)
# GData start-index is 1-based; step by _MAX_RESULTS per page.
1784 url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
1786 page = compat_urllib_request.urlopen(url).read().decode('utf8')
1787 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1788 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
1792 response = json.loads(page)
1793 except ValueError as err:
1794 self._downloader.report_error(u'Invalid JSON in API response: ' + compat_str(err))
1797 if 'feed' not in response:
1798 self._downloader.report_error(u'Got a malformed response from YouTube API')
1800 if 'entry' not in response['feed']:
1801 # Number of videos is a multiple of self._MAX_RESULTS
# Collect (position, video_url) pairs; entries without 'content' (e.g.
# deleted/private videos) are skipped.
1804 videos += [ (entry['yt$position']['$t'], entry['content']['src'])
1805 for entry in response['feed']['entry']
1806 if 'content' in entry ]
# A short page means this was the last one.
1808 if len(response['feed']['entry']) < self._MAX_RESULTS:
# Sort by playlist position, then keep only the URLs.
1812 videos = [v[1] for v in sorted(videos)]
1814 url_results = [self.url_result(url, 'Youtube') for url in videos]
1815 return [self.playlist_result(url_results, playlist_id)]
# YoutubeChannelIE: collects all video ids from a /channel/<id> page — the
# first page as HTML, subsequent pages via the JSON channel_ajax endpoint —
# and returns them as a playlist of url_results.
# NOTE(review): excerpt has elided lines (`if mobj is None:`/`try:`/
# `return`/`break`, loop setup) and stripped indentation — confirm against
# the full file.
1818 class YoutubeChannelIE(InfoExtractor):
1819 """Information Extractor for YouTube channels."""
1821 _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
1822 _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
# Marker string whose presence in a page means more pages exist.
1823 _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
1824 _MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
1825 IE_NAME = u'youtube:channel'
1827 def report_download_page(self, channel_id, pagenum):
1828 """Report attempt to download channel page with given number."""
1829 self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))
# Scrape /watch?v=<id> hrefs out of an HTML fragment, deduplicated in
# first-seen order. (ids_in_page initialization/return elided here.)
1831 def extract_videos_from_page(self, page):
1833 for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
1834 if mobj.group(1) not in ids_in_page:
1835 ids_in_page.append(mobj.group(1))
1838 def _real_extract(self, url):
1839 # Extract channel id
1840 mobj = re.match(self._VALID_URL, url)
1842 self._downloader.report_error(u'invalid url: %s' % url)
1845 # Download channel page
1846 channel_id = mobj.group(1)
1850 self.report_download_page(channel_id, pagenum)
1851 url = self._TEMPLATE_URL % (channel_id, pagenum)
1852 request = compat_urllib_request.Request(url)
1854 page = compat_urllib_request.urlopen(request).read().decode('utf8')
1855 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1856 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
1859 # Extract video identifiers
1860 ids_in_page = self.extract_videos_from_page(page)
1861 video_ids.extend(ids_in_page)
1863 # Download any subsequent channel pages using the json-based channel_ajax query
1864 if self._MORE_PAGES_INDICATOR in page:
1866 pagenum = pagenum + 1
1868 self.report_download_page(channel_id, pagenum)
1869 url = self._MORE_PAGES_URL % (pagenum, channel_id)
1870 request = compat_urllib_request.Request(url)
1872 page = compat_urllib_request.urlopen(request).read().decode('utf8')
1873 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1874 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
# Ajax endpoint returns JSON; the HTML fragment lives in 'content_html'.
1877 page = json.loads(page)
1879 ids_in_page = self.extract_videos_from_page(page['content_html'])
1880 video_ids.extend(ids_in_page)
# Stop when the load-more widget no longer advertises further pages.
1882 if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
1885 self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))
1887 urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids]
1888 url_entries = [self.url_result(url, 'Youtube') for url in urls]
1889 return [self.playlist_result(url_entries, channel_id)]
# YoutubeUserIE: resolves a youtube.com/user/<name> (or ytuser:<name>) URL
# to all of that user's uploads via the paged GData uploads feed, returning
# a playlist of url_results titled with the username.
# NOTE(review): excerpt has elided lines (`if mobj is None:`/`try:`/
# `return`/`break`, loop setup) and stripped indentation — confirm against
# the full file.
1892 class YoutubeUserIE(InfoExtractor):
1893 """Information Extractor for YouTube users."""
1895 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1896 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
# GData caps results per request; drives the paging arithmetic below.
1897 _GDATA_PAGE_SIZE = 50
1898 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
# group(1) is the video id inside the feed's /watch?v=... links.
1899 _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
1900 IE_NAME = u'youtube:user'
1902 def __init__(self, downloader=None):
1903 InfoExtractor.__init__(self, downloader)
1905 def report_download_page(self, username, start_index):
1906 """Report attempt to download user page."""
1907 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
1908 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
1910 def _real_extract(self, url):
1912 mobj = re.match(self._VALID_URL, url)
1914 self._downloader.report_error(u'invalid url: %s' % url)
1917 username = mobj.group(1)
1919 # Download video ids using YouTube Data API. Result size per
1920 # query is limited (currently to 50 videos) so we need to query
1921 # page by page until there are no video ids - it means we got
# (rest of this comment and the pagination loop header elided here)
# GData start-index is 1-based.
1928 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1929 self.report_download_page(username, start_index)
1931 request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
1934 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1935 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1936 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
1939 # Extract video identifiers
# Dedupe within the page while preserving first-seen order.
1942 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1943 if mobj.group(1) not in ids_in_page:
1944 ids_in_page.append(mobj.group(1))
1946 video_ids.extend(ids_in_page)
1948 # A little optimization - if current page is not
1949 # "full", ie. does not contain PAGE_SIZE video ids then
1950 # we can assume that this page is the last one - there
1951 # are no more ids on further pages - no need to query
# (rest of this comment and the loop-exit statement elided here)
1954 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
1959 urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
1960 url_results = [self.url_result(url, 'Youtube') for url in urls]
1961 return [self.playlist_result(url_results, playlist_title = username)]
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users.

    Pages through blip.tv's mobile AJAX episode-list endpoint to collect
    every video of a user, returned as a single playlist result.
    NOTE(review): guards, try: statements and the paging loop were missing
    from the mangled source and were restored.
    """

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    # Ids per AJAX page; a short page signals the last one.
    # NOTE(review): attribute is referenced below but its definition was
    # lost in the mangled source - value restored, confirm against upstream.
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
                (self.IE_NAME, username, pagenum))

    def _real_extract(self, url):
        # Extract username from the URL.
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        request = compat_urllib_request.Request(url)

        try:
            # The numeric users_id needed by the AJAX endpoint is embedded
            # in the user's HTML page.
            page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            mobj = re.search(r'data-users-id="([^"]+)"', page)
            page_base = page_base % mobj.group(1)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
            return

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.
        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(username, pagenum)
            url = page_base + "&page=" + str(pagenum)
            request = compat_urllib_request.Request( url )

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                # compat_str for consistency with the rest of the file
                # (plain str(err) can misbehave on non-ASCII messages).
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers, de-duplicated, first-seen order.
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(unescapeHTML(mobj.group(1)))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.
            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
        url_entries = [self.url_result(url, 'BlipTV') for url in urls]
        return [self.playlist_result(url_entries, playlist_title = username)]
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com

    NOTE(review): guard clauses, try: statements and returns were missing
    from the mangled source and were restored.
    """

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

    def _real_extract(self, url):
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        try:
            self.report_download_webpage(file_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve file webpage: %s' % compat_str(err))
            return

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                # Raw string so \s is a regex class, not a string escape.
                restriction_message = re.sub(r'\s+', ' ', mobj.group(1)).strip()
                self._downloader.report_error(u'%s' % restriction_message)
            else:
                self._downloader.report_error(u'unable to extract download URL from: %s' % url)
            return

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        file_title = mobj.group(1).decode('utf-8')

        return [{
            'id': file_id.decode('utf-8'),
            'url': file_url.decode('utf-8'),
            'uploader': None,
            'upload_date': None,
            'title': file_title,
            'ext': file_extension.decode('utf-8'),
        }]
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook

    NOTE(review): guards, try: statements, credential defaults and the
    login form literal were missing from the mangled source and were
    restored; confirm against upstream.
    """

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[%s] Logging in' % self.IE_NAME)

    def _real_initialize(self):
        """Log in using --username/--password or .netrc credentials, if any."""
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        # No credentials available: continue anonymously.
        if useremail is None:
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read()
            # A login <form> in the response means authentication failed.
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The JSON video parameters are wedged between these two JS fragments.
        BEFORE = '{swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        if not m:
            raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        video_data = params['video_data'][0]
        # Prefer the HD stream; fall back to SD.
        video_url = video_data.get('hd_src')
        if not video_url:
            video_url = video_data['sd_src']
        if not video_url:
            raise ExtractorError(u'Cannot find video URL')
        video_duration = int(video_data['video_duration'])
        thumbnail = video_data['thumbnail_src']

        m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
        if not m:
            raise ExtractorError(u'Cannot find title in webpage')
        video_title = unescapeHTML(m.group(1))

        info = {
            'id': video_id,
            'title': video_title,
            'url': video_url,
            'ext': 'mp4',
            'duration': video_duration,
            'thumbnail': thumbnail,
        }
        return [info]
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv

    NOTE(review): guards, try: statements, the '?'/'&' query-separator
    choice and the direct-download info dict were missing from the mangled
    source and were restored.
    """

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # /play/ URLs redirect to a page whose fragment carries the real
        # file id; resolve it and re-run extraction on the canonical URL.
        urlp = compat_urllib_parse_urlparse(url)
        if urlp.path.startswith('/play/'):
            request = compat_urllib_request.Request(url)
            response = compat_urllib_request.urlopen(request)
            redirecturl = response.geturl()
            rurlp = compat_urllib_parse_urlparse(redirecturl)
            file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
            url = 'http://blip.tv/a/a-' + file_id
            return self._real_extract(url)

        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        # blip.tv serves different data to the iTunes user agent.
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    'urlhandle': urlh
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
            try:
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to read video info webpage: %s' % compat_str(err))
                return

            try:
                json_data = json.loads(json_code)
                # Some responses wrap the payload in a 'Post' object.
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl'],
                    'user_agent': 'iTunes/10.6.1',
                }
            except (ValueError,KeyError) as err:
                self._downloader.report_error(u'unable to parse video information: %s' % repr(err))
                return

        return [info]
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de.

    NOTE(review): guard clauses and the result dict were missing from the
    mangled source and were restored. Also fixes the `self._download`
    typo (attribute is `self._downloader` everywhere else in the file).
    """

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)

    def _real_extract(self,url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # BUGFIX: was self._download.report_error (AttributeError).
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)
        # The thumbnail link encodes the media server base URL.
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\'',
                 webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': u'flv',
        }]
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report

    NOTE(review): guards, try: statements, the (?P<clip>...) group, the
    format tables' contents and the results accumulator were missing from
    the mangled source and were restored; confirm values against upstream.
    """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                     |(https?://)?(www\.)?
                         (?P<showname>thedailyshow|colbertnation)\.com/
                        (full-episodes/(?P<episode>.*)|
                          (?P<clip>
                              (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                              |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                     $"""

    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    # bitrate -> container extension
    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    # bitrate -> display resolution (informational, for --list-formats)
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written with re.VERBOSE, so the base class'
        # suitable() cannot be reused here.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_extraction(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

    def report_config_download(self, episode_id, media_id):
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration for %s' % (episode_id, media_id))

    def report_index_download(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

    def _print_formats(self, formats):
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # Shortnames (":tds", ":colbert", ...) resolve to the newest
        # full episode of the corresponding show.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
            dlNewest = False
        else:
            # An empty episode part means "download the newest episode";
            # the server will redirect us to it.
            dlNewest = not mobj.group('episode')
            if dlNewest:
                epTitle = mobj.group('showname')
            else:
                epTitle = mobj.group('episode')

        req = compat_urllib_request.Request(url)
        self.report_extraction(epTitle)
        try:
            htmlHandle = compat_urllib_request.urlopen(req)
            html = htmlHandle.read()
            webpage = html.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
            return
        if dlNewest:
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
                self._downloader.report_error(u'Invalid redirected URL: ' + url)
                return
            if mobj.group('episode') == '':
                self._downloader.report_error(u'Redirected URL is still not specific: ' + url)
                return
            epTitle = mobj.group('episode')

        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                self._downloader.report_error(u'unable to find Flash URL in webpage ' + url)
                return
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        try:
            indexXml = compat_urllib_request.urlopen(indexUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download episode index: ' + compat_str(err))
            return

        results = []

        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        # One <item> per episode part (act).
        for partNum,itemEl in enumerate(itemEls):
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configReq = compat_urllib_request.Request(configUrl)
            self.report_config_download(epTitle, shortMediaId)
            try:
                configXml = compat_urllib_request.urlopen(configReq).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            cdoc = xml.etree.ElementTree.fromstring(configXml)
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the highest bitrate
            format,rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f,v in turls:
                if f == req_format:
                    format, rtmp_video_url = f, v
                    break

            # Rewrite the RTMP url to the equivalent progressive-HTTP one.
            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            if not m:
                raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
            }
            results.append(info)

        return results
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist

    NOTE(review): guards, try: statements and the result dict were missing
    from the mangled source and were restored.
    """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_extraction(self, showName):
        self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)

    def report_config_download(self, showName):
        self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        try:
            webPage = compat_urllib_request.urlopen(url)
            webPageBytes = webPage.read()
            # Honour the declared charset; default to utf-8.
            m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
            webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: ' + compat_str(err))
            return

        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
        description = unescapeHTML(descMatch.group(1))
        imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
        imgUrl = unescapeHTML(imgMatch.group(1))
        playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
        playerUrl = unescapeHTML(playerUrlMatch.group(1))
        configUrlMatch = re.search('config=(.*)$', playerUrl)
        configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))

        self.report_config_download(showName)
        try:
            configJSON = compat_urllib_request.urlopen(configUrl)
            m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
            configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download configuration: ' + compat_str(err))
            return

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

        try:
            config = json.loads(configJSON)
        except (ValueError,) as err:
            self._downloader.report_error(u'Invalid JSON in configuration file: ' + compat_str(err))
            return

        playlist = config['playlist']
        videoUrl = playlist[1]['url']

        info = {
            'id': videoId,
            'url': videoUrl,
            'uploader': showName,
            'upload_date': None,
            'title': showName,
            'ext': 'mp4',
            'thumbnail': imgUrl,
            'description': description,
            'player_url': playerUrl,
        }

        return [info]
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com

    NOTE(review): guards, try: statements and the info-dict initialisation
    were missing from the mangled source and were restored.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group('videoid')

        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        try:
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
        except IndexError:
            self._downloader.report_error(u'Invalid metadata XML file')
            return

        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
            return

        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        try:
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError as err:
            self._downloader.report_error(u'Invalid manifest file')
            return

        # Build the URL of the first HDS fragment from the manifest pieces.
        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'

        info['url'] = url
        info['ext'] = 'f4f'
        return [info]
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com

    NOTE(review): guard clauses and the result dict were missing from the
    mangled source and were restored.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group(1)

        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        # Extract video URL
        mobj = re.search(r'flv_url=(.+?)&', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(mobj.group(1))

        # Extract title
        mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = mobj.group(1)

        # Extract video thumbnail
        mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video thumbnail')
            return
        video_thumbnail = mobj.group(0)

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }

        return [info]
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid

    NOTE(review): guards, try: statements and the result dict were missing
    from the mangled source and were restored.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract simple title (uploader + slug of song title)
        slug_title = mobj.group(2)
        simple_title = uploader + u'-' + slug_title

        self.report_resolve('%s/%s' % (uploader, slug_title))

        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(resolv_url)
        try:
            info_json_bytes = compat_urllib_request.urlopen(request).read()
            info_json = info_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
            return

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction('%s/%s' % (uploader, slug_title))

        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(streams_url)
        try:
            stream_json_bytes = compat_urllib_request.urlopen(request).read()
            stream_json = stream_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download stream definitions: %s' % compat_str(err))
            return

        streams = json.loads(stream_json)
        mediaURL = streams['http_mp3_128_url']

        return [{
            'id': info['id'],
            'url': mediaURL,
            'uploader': info['user']['username'],
            'upload_date': info['created_at'],
            'title': info['title'],
            'ext': u'mp3',
            'description': info['description'],
        }]
class SoundcloudSetIE(InfoExtractor):
    """Information extractor for soundcloud.com sets
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid

    NOTE(review): guards, try: statements and the per-track result list
    were missing from the mangled source and were restored. Deprecated
    self._downloader.trouble(u'ERROR: ...') calls were replaced with
    report_error(...) for consistency with SoundcloudIE, and the IE_NAME
    no longer duplicates SoundcloudIE's.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
    # Was u'soundcloud', colliding with SoundcloudIE above.
    IE_NAME = u'soundcloud:set'

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract simple title (uploader + slug of song title)
        slug_title = mobj.group(2)
        simple_title = uploader + u'-' + slug_title

        self.report_resolve('%s/sets/%s' % (uploader, slug_title))

        url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(resolv_url)
        try:
            info_json_bytes = compat_urllib_request.urlopen(request).read()
            info_json = info_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
            return

        videos = []
        info = json.loads(info_json)
        if 'errors' in info:
            for err in info['errors']:
                self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
            return

        for track in info['tracks']:
            video_id = track['id']
            self.report_extraction('%s/sets/%s' % (uploader, slug_title))

            streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
            request = compat_urllib_request.Request(streams_url)
            try:
                stream_json_bytes = compat_urllib_request.urlopen(request).read()
                stream_json = stream_json_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download stream definitions: %s' % compat_str(err))
                return

            streams = json.loads(stream_json)
            mediaURL = streams['http_mp3_128_url']

            videos.append({
                'id': video_id,
                'url': mediaURL,
                'uploader': track['user']['username'],
                'upload_date': track['created_at'],
                'title': track['title'],
                'ext': u'mp3',
                'description': track['description'],
            })
        return videos
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com

    NOTE(review): guard clauses and the result dict were missing from the
    mangled source and were restored.
    """
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # Extract video URL (base64-encoded rtmp path in the page source)
        mobj = re.search(r"jsclassref='([^']*)'", webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        # Extract title
        mobj = re.search(r'contentTitle = "(.*?)";', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = mobj.group(1)

        # Extract description
        video_description = u'No description available.'
        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if mobj is not None:
            video_description = mobj.group(1)

        video_filename = video_url.split('/')[-1]
        video_id, extension = video_filename.split('.')

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }

        return [info]
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_json(self, file_id):
        """Report JSON download."""
        self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json"""
        # NOTE(review): listing gap — the opening 'try:' line is missing here.
        bitrate_list = jsonData[fmt]
        # 'best' (or an unknown/None bitrate) falls back to the highest available one.
        if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
            bitrate = max(bitrate_list) # select highest
        url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        # NOTE(review): listing gap — the 'return url_list' line is missing here.

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        for url in url_list:
            # NOTE(review): listing gap — 'try:' and the success 'return url' lines are missing here.
            compat_urllib_request.urlopen(url)
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        # NOTE(review): listing gap — the fallback return for no working URL is missing here.

    def _print_formats(self, formats):
        # List every format and bitrate with the corresponding file extension.
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                # NOTE(review): listing gap — 'try:' line missing here.
                ext = formats[fmt][b][0]
                print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): listing gap — the 'if mobj is None:' guard line is missing here.
        self._downloader.report_error(u'invalid URL: %s' % url)

        # extract uploader & filename from url
        uploader = mobj.group(1).decode('utf-8')
        file_id = uploader + "-" + mobj.group(2).decode('utf-8')

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        # NOTE(review): listing gap — 'try:' line missing here.
        self.report_download_json(file_url)
        jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve file: %s' % compat_str(err))

        # Parse the API response.
        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            # NOTE(review): listing gap — the early 'return' after listing formats is missing here.

        # Default/best: probe each format until an active URL is found.
        if req_format is None or req_format == 'best':
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    # NOTE(review): listing gap — 'break' and the 'else' branch lines are missing here.
        if req_format not in formats:
            self._downloader.report_error(u'format is not available')
        url_list = self.get_urls(formats, req_format)
        file_url = self.check_urls(url_list)
        format_param = req_format

        # NOTE(review): listing gap — the 'return [{' opener and closing lines around
        # these dict entries are missing from this listing; entries kept verbatim.
        'id': file_id.decode('utf-8'),
        'url': file_url.decode('utf-8'),
        'uploader': uploader.decode('utf-8'),
        'upload_date': None,
        'title': json_data['name'],
        'ext': file_url.split('.')[-1].decode('utf-8'),
        'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
        'thumbnail': json_data['thumbnail_url'],
        'description': json_data['description'],
        'player_url': player_url.decode('utf-8'),
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""

    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def report_download_webpage(self, objid):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        # Handles three URL shapes: a specific video, a course page, and the site root.
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): listing gap — the 'if mobj is None:' guard line is missing here.
        raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            # NOTE(review): listing gap — the 'info = {' opener around these entries is missing.
            'id': course + '_' + video,
            'upload_date': None,

            self.report_extraction(info['id'])
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            # NOTE(review): listing gap — 'try:' line missing here.
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            # NOTE(review): listing gap — 'try:' line missing here.
            info['title'] = mdoc.findall('./title')[0].text
            info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            # NOTE(review): listing gap — the 'except' opener for the metadata lookup is missing here.
            self._downloader.report_error(u'Invalid metadata XML file')
            info['ext'] = info['url'].rpartition('.')[2]
            # NOTE(review): listing gap — the 'return' for the single-video branch is missing here.
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            # NOTE(review): listing gap — the 'info = {' opener around this entry is missing.
            'upload_date': None,

            coursepage = self._download_webpage(url, info['id'],
                                note='Downloading course info page',
                                errnote='Unable to download course info page')

            m = re.search('<h1>([^<]+)</h1>', coursepage)
            # NOTE(review): listing gap — the 'if m:' guard and its 'else:' line are missing;
            # the id is kept as a fallback title when no <h1> is found.
            info['title'] = unescapeHTML(m.group(1))
            info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
            # NOTE(review): listing gap — the 'if m:' guard line is missing here.
            info['description'] = unescapeHTML(m.group(1))

            # Collect the course's per-video page links, deduplicated in order.
            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            # NOTE(review): listing gap — the comprehension opener building info['list'] is missing.
            'type': 'reference',
            'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),

            # Recurse into each referenced video page.
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            # NOTE(review): listing gap — the 'return results' line and the 'else:' opener
            # for the root-page branch below are missing from this listing.
            'id': 'Stanford OpenClassroom',
            'upload_date': None,

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            # NOTE(review): listing gap — 'try:' line missing here.
            rootpage = compat_urllib_request.urlopen(rootURL).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download course info page: ' + compat_str(err))

            info['title'] = info['id']

            # Collect the site's course page links, deduplicated in order.
            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            # NOTE(review): listing gap — the comprehension opener building info['list'] is missing.
            'type': 'reference',
            'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),

            # Recurse into each referenced course page.
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): listing gap — the 'if mobj is None:' guard line is missing here.
        self._downloader.report_error(u'invalid URL: %s' % url)
        # The scheme is optional in _VALID_URL; normalize to http:// for download.
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        # Song name, performer and playlist/content ids come from <meta> tags.
        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        # NOTE(review): listing gap — failure guard line missing here.
        self._downloader.report_error(u'unable to extract song name')
        song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        # NOTE(review): listing gap — failure guard line missing here.
        self._downloader.report_error(u'unable to extract performer')
        performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        # NOTE(review): listing gap — failure guard line missing here.
        self._downloader.report_error(u'unable to mtvn_uri')
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        # NOTE(review): listing gap — failure guard line missing here.
        self._downloader.report_error(u'unable to extract content id')
        content_id = mobj.group(1)

        # mediaGen endpoint returns an XML document listing renditions.
        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        # NOTE(review): listing gap — 'try:' line missing here.
        metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video metadata: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        # NOTE(review): listing gap — 'try:' line missing here.
        _,_,ext = rendition.attrib['type'].partition('/')
        format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
        video_url = rendition.find('./src').text
        # NOTE(review): listing gap — the 'except KeyError:' opener is missing here.
        self._downloader.trouble('Invalid rendition field.')

        # NOTE(review): listing gap — the return-dict opener/closer around these entries
        # is missing from this listing; entries kept verbatim.
        'uploader': performer,
        'upload_date': None,
        'title': video_title,
class YoukuIE(InfoExtractor):
    # Information extractor for v.youku.com.
    _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, file_id))

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    # NOTE(review): listing gap — the 'def _gen_sid(self):' header line is missing here;
    # the body builds a session id from the current time plus two random components.
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)
        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        # Deterministic seeded shuffle of the character alphabet used to decode file ids.
        # NOTE(review): listing gap — the 'mixed = []' initializer line is missing here.
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        for i in range(len(source)):
            # Linear-congruential step; each iteration picks (and removes) one character.
            seed = (seed * 211 + 30031 ) % 65536
            index = math.floor(seed / 65536 * len(source) )
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        #return ''.join(mixed)
        # NOTE(review): listing gap — the actual 'return mixed' line is missing here.

    def _get_file_id(self, fileId, seed):
        # Decode a '*'-separated index string into the real file id via the mixed alphabet.
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        # NOTE(review): listing gap — the accumulator init and 'for ch in ids:' loop
        # opener lines are missing here.
                realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): listing gap — the 'if mobj is None:' guard line is missing here.
        self._downloader.report_error(u'invalid URL: %s' % url)
        video_id = mobj.group('ID')

        # Player API returns a JSON playlist for the video id.
        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        request = compat_urllib_request.Request(info_url, None, std_headers)
        # NOTE(review): listing gap — 'try:' line missing here.
        self.report_download_webpage(video_id)
        jsondata = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))

        self.report_extraction(video_id)
        # NOTE(review): listing gap — 'try:' line missing here (the 'except' is at the bottom).
        jsonstr = jsondata.decode('utf-8')
        config = json.loads(jsonstr)

        video_title = config['data'][0]['title']
        seed = config['data'][0]['seed']

        format = self._downloader.params.get('format', None)
        supported_format = list(config['data'][0]['streamfileids'].keys())

        # Map the requested format onto one of the stream format ids.
        if format is None or format == 'best':
            if 'hd2' in supported_format:
                # NOTE(review): listing gap — format/extension selection lines are missing here.
        elif format == 'worst':
            # NOTE(review): listing gap — 'worst' and explicit-format selection lines are missing here.

        fileid = config['data'][0]['streamfileids'][format]
        keys = [s['k'] for s in config['data'][0]['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            self._downloader.report_error(u'unable to extract info section')

        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        #column 8,9 of fileid represent the segment number
        #fileid[7:9] should be changed
        for index, key in enumerate(keys):
            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            # NOTE(review): listing gap — the 'info = {' opener and some entries are missing here.
            'id': '%s_part%02d' % (video_id, index),
            'url': download_url,
            'upload_date': None,
            'title': video_title,

            files_info.append(info)
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    # Patterns for the stream URL, page title, and thumbnail embedded in the page.
    VIDEO_URL_RE = r'flv_url=(.*?)&'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'

    def report_webpage(self, video_id):
        """Report information extraction"""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction"""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): listing gap — the 'if mobj is None:' guard line is missing here.
        self._downloader.report_error(u'invalid URL: %s' % url)
        video_id = mobj.group(1)

        self.report_webpage(video_id)

        # Get webpage content
        # NOTE(review): listing gap — 'try:' line missing here.
        webpage_bytes = compat_urllib_request.urlopen(url).read()
        webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % err)

        result = re.search(self.VIDEO_URL_RE, webpage)
        # NOTE(review): listing gap — failure guard line missing here.
        self._downloader.report_error(u'unable to extract video url')
        # flv_url is percent-encoded inside the page.
        video_url = compat_urllib_parse.unquote(result.group(1))

        result = re.search(self.VIDEO_TITLE_RE, webpage)
        # NOTE(review): listing gap — failure guard line missing here.
        self._downloader.report_error(u'unable to extract video title')
        video_title = result.group(1)

        result = re.search(self.VIDEO_THUMB_RE, webpage)
        # NOTE(review): listing gap — failure guard line missing here.
        self._downloader.report_error(u'unable to extract video thumbnail')
        video_thumbnail = result.group(1)

        # NOTE(review): listing gap — the return-dict opener/closer around these entries
        # is missing from this listing; entries kept verbatim.
        'upload_date': None,
        'title': video_title,
        'thumbnail': video_thumbnail,
        'description': None,
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extract_entry(self, url):
        """Report downloading extry"""
        self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)

    def report_date(self, upload_date):
        """Report downloading extry"""
        self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)

    def report_uploader(self, uploader):
        """Report downloading extry"""
        self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)

    def report_title(self, video_title):
        """Report downloading extry"""
        self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)

    def report_extract_vid_page(self, video_page):
        """Report information extraction."""
        self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)

    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): listing gap — the 'if mobj is None:' guard line is missing here.
        self._downloader.report_error(u'Invalid URL: %s' % url)

        post_url = mobj.group(0)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        self.report_extract_entry(post_url)
        request = compat_urllib_request.Request(post_url)
        # NOTE(review): listing gap — 'try:' line missing here.
        webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve entry webpage: %s' % compat_str(err))

        # Extract update date
        pattern = 'title="Timestamp">(.*?)</a>'
        mobj = re.search(pattern, webpage)
        # NOTE(review): listing gap — the 'if mobj:' guard line is missing here.
        upload_date = mobj.group(1)
        # Convert timestring to a format suitable for filename
        upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
        upload_date = upload_date.strftime('%Y%m%d')
        self.report_date(upload_date)

        # Extract uploader from the author link.
        pattern = r'rel\="author".*?>(.*?)</a>'
        mobj = re.search(pattern, webpage)
        # NOTE(review): listing gap — the 'if mobj:' guard line is missing here.
        uploader = mobj.group(1)
        self.report_uploader(uploader)

        # Get the first line for title
        pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
        mobj = re.search(pattern, webpage)
        # NOTE(review): listing gap — the 'if mobj:' guard line is missing here.
        video_title = mobj.group(1)
        self.report_title(video_title)

        # Step 2, Stimulate clicking the image box to launch video
        pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
        mobj = re.search(pattern, webpage)
        # NOTE(review): listing gap — the 'if mobj is None:' guard line is missing here.
        self._downloader.report_error(u'unable to extract video page URL')

        video_page = mobj.group(1)
        request = compat_urllib_request.Request(video_page)
        # NOTE(review): listing gap — 'try:' line missing here.
        webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
        self.report_extract_vid_page(video_page)

        # Extract video links on video page
        """Extract video links of all sizes"""
        pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        mobj = re.findall(pattern, webpage)
        # NOTE(review): listing gap — the empty-result guard line is missing here.
        self._downloader.report_error(u'unable to extract video links')

        # Sort in resolution
        links = sorted(mobj)

        # Choose the lowest of the sort, i.e. highest resolution
        video_url = links[-1]
        # Only get the url. The resolution part in the tuple has no use anymore
        video_url = video_url[-1]
        # Treat escaped \u0026 style hex
        # NOTE(review): listing gap — 'try:' line missing here.
        video_url = video_url.decode("unicode_escape")
        except AttributeError: # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')

        # NOTE(review): listing gap — the return-dict opener/closer around these entries
        # is missing from this listing; entries kept verbatim.
        'uploader': uploader,
        'upload_date': upload_date,
        'title': video_title,
        'ext': video_extension,
class NBAIE(InfoExtractor):
    # Information extractor for nba.com video pages.
    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): listing gap — the 'if mobj is None:' guard line is missing here.
        self._downloader.report_error(u'invalid URL: %s' % url)

        # The id is the URL path after /video, with any trailing /index.html stripped.
        video_id = mobj.group(1)
        if video_id.endswith('/index.html'):
            video_id = video_id[:-len('/index.html')]

        webpage = self._download_webpage(url, video_id)

        # The CDN URL is derived directly from the page path.
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'

        def _findProp(rexp, default=None):
            # Search the fetched page; unescape the first capture group on a hit.
            m = re.search(rexp, webpage)
            # NOTE(review): listing gap — the 'if m:' guard and the default-return lines are missing here.
            return unescapeHTML(m.group(1))

        shortened_video_id = video_id.rpartition('/')[2]
        title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
        # NOTE(review): listing gap — the 'info = {' opener and some entries around these
        # lines are missing from this listing; entries kept verbatim.
        'id': shortened_video_id,
        'uploader_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
        'description': _findProp(r'<div class="description">(.*?)</h1>'),
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
                (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url):
        # NOTE(review): listing gap — 'try:' line missing here.
        urlh = compat_urllib_request.urlopen(url)
        webpage_bytes = urlh.read()
        webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info JSON: %s' % compat_str(err))

        response = json.loads(webpage)
        # The API returns a list on success, an object with an 'error' key on failure.
        if type(response) != list:
            error_text = response.get('error', 'unknown error')
            self._downloader.report_error(u'Justin.tv API: %s' % error_text)
        # NOTE(review): listing gap — the 'info = []' initializer line is missing here.
        for clip in response:
            video_url = clip['video_file_url']
            # NOTE(review): listing gap — the 'if video_url:' guard line is missing here.
            video_extension = os.path.splitext(video_url)[1][1:]
            # start_time is ISO-ish; keep the date part and strip dashes -> YYYYMMDD.
            video_date = re.sub('-', '', clip['start_time'][:10])
            video_uploader_id = clip.get('user_id', clip.get('channel_id'))
            video_id = clip['id']
            video_title = clip.get('title', video_id)
            # NOTE(review): listing gap — the 'info.append({' opener and id/url entries are missing here.
            'title': video_title,
            'uploader': clip.get('channel_name', video_uploader_id),
            'uploader_id': video_uploader_id,
            'upload_date': video_date,
            'ext': video_extension,
        return (len(response), info)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): listing gap — the 'if mobj is None:' guard line is missing here.
        self._downloader.report_error(u'invalid URL: %s' % url)

        api = 'http://api.justin.tv'
        video_id = mobj.group(mobj.lastindex)
        # One capture group matched -> whole channel (paged); two -> a single broadcast.
        if mobj.lastindex == 1:
            # NOTE(review): listing gap — the paged-flag line is missing here.
            api += '/channel/archives/%s.json'
            # NOTE(review): listing gap — the 'else:' opener for the branch below is missing here.
            api += '/broadcast/by_archive/%s.json'
        api = api % (video_id,)

        self.report_extraction(video_id)

        # NOTE(review): listing gap — accumulator/offset initializers are missing here.
        limit = self._JUSTIN_PAGE_LIMIT
        # NOTE(review): listing gap — the paging loop opener is missing here.
        self.report_download_page(video_id, offset)
        page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
        page_count, page_info = self._parse_page(page_url)
        info.extend(page_info)
        # A short page means we've reached the end of the archive.
        if not paged or page_count != limit:
class FunnyOrDieIE(InfoExtractor):
    # Information extractor for funnyordie.com video pages.
    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): listing gap — the 'if mobj is None:' guard line is missing here.
        self._downloader.report_error(u'invalid URL: %s' % url)

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        # Video URL comes from the second <source> of the page's <video> tag.
        m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
        # NOTE(review): listing gap — the 'if not m:' guard line is missing here.
        self._downloader.report_error(u'unable to find video information')
        video_url = unescapeHTML(m.group('url'))

        # Prefer the player-page heading; fall back to the document <title>.
        m = re.search(r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", webpage, flags=re.DOTALL)
        # NOTE(review): listing gap — guard lines around the fallback search are missing here.
        m = re.search(r'<title>(?P<title>[^<]+?)</title>', webpage)
        self._downloader.trouble(u'Cannot find video title')
        title = clean_html(m.group('title'))

        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        # NOTE(review): listing gap — the 'if m:' guard line is missing here.
        desc = unescapeHTML(m.group('desc'))

        # NOTE(review): listing gap — the 'info = {' opener and other entries around this
        # line are missing from this listing; entry kept verbatim.
        'description': desc,
class SteamIE(InfoExtractor):
    # Information extractor for store.steampowered.com video/app pages.
    _VALID_URL = r"""http://store.steampowered.com/
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
    # NOTE(review): listing gap — the gameID group line and the closing triple-quote of
    # this verbose regex are missing from this listing.

    # NOTE(review): listing gap — the '@classmethod' decorator line is missing here.
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is a verbose (re.X-style) pattern, so match with re.VERBOSE.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        # Each movie entry on the page looks like: 'movie_<id>': { FILENAME: "...", MOVIE_NAME: "..." }
        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        gameID = m.group('gameID')
        videourl = 'http://store.steampowered.com/video/%s/' % gameID
        webpage = self._download_webpage(videourl, gameID)
        # Walk movie entries, titles and thumbnails in lockstep.
        mweb = re.finditer(urlRE, webpage)
        namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
        titles = re.finditer(namesRE, webpage)
        thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
        thumbs = re.finditer(thumbsRE, webpage)
        # NOTE(review): listing gap — the results-list initializer line is missing here.
        for vid,vtitle,thumb in zip(mweb,titles,thumbs):
            video_id = vid.group('videoID')
            title = vtitle.group('videoName')
            video_url = vid.group('videoURL')
            video_thumb = thumb.group('thumbnail')
            # NOTE(review): listing gap — the 'if not video_url:' guard line is missing here.
            self._downloader.report_error(u'Cannot find video url for %s' % video_id)
            # NOTE(review): listing gap — the per-video 'info = {' opener and id/url/ext
            # entries around these lines are missing; entries kept verbatim.
            'title': unescapeHTML(title),
            'thumbnail': video_thumb
class UstreamIE(InfoExtractor):
    # Information extractor for recorded ustream.tv videos.
    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')
        # The flv is served directly from the CDN by recorded-video id.
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)
        # Title and uploader come from data attributes on the page.
        m = re.search(r'data-title="(?P<title>.+)"',webpage)
        title = m.group('title')
        m = re.search(r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',webpage)
        uploader = m.group('uploader')
        # NOTE(review): listing gap — the 'info = {' opener, the other entries, and the
        # return lines around this entry are missing from this listing; entry kept verbatim.
        'uploader': uploader
class WorldStarHipHopIE(InfoExtractor):
    # Information extractor for worldstarhiphop.com (and its "candy" mirror).
    _VALID_URL = r'http://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
    IE_NAME = u'WorldStarHipHop'

    def _real_extract(self, url):
        # Direct media links on the page point at the hw-videos CDN (mp4 or flv).
        _src_url = r"""(http://hw-videos.*(?:mp4|flv))"""

        webpage_src = compat_urllib_request.urlopen(url).read()
        webpage_src = webpage_src.decode('utf-8')

        mobj = re.search(_src_url, webpage_src)

        m = re.match(self._VALID_URL, url)
        video_id = m.group('id')

        if mobj is not None:
            video_url = mobj.group()
            # NOTE(review): listing gap — the ext-selection lines for the mp4/flv branches
            # and the 'else:' opener before the error below are missing here.
            if 'mp4' in video_url:
        self._downloader.trouble(u'ERROR: Cannot find video url for %s' % video_id)

        _title = r"""<title>(.*)</title>"""

        mobj = re.search(_title, webpage_src)

        if mobj is not None:
            title = mobj.group(1)
            # NOTE(review): listing gap — the 'else:' opener before this fallback is missing here.
            title = 'World Start Hip Hop - %s' % time.ctime()

        _thumbnail = r"""rel="image_src" href="(.*)" />"""
        mobj = re.search(_thumbnail, webpage_src)

        # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
        if mobj is not None:
            thumbnail = mobj.group(1)
            # NOTE(review): listing gap — the 'else:' opener before the candy-title fallback is missing here.
            _title = r"""candytitles.*>(.*)</span>"""
            mobj = re.search(_title, webpage_src)
            if mobj is not None:
                title = mobj.group(1)

        # NOTE(review): listing gap — the results-dict opener, the other entries, and the
        # return lines around this entry are missing from this listing; entry kept verbatim.
        'thumbnail' : thumbnail,
class RBMARadioIE(InfoExtractor):
    # Information extractor for rbmaradio.com shows.
    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')

        webpage = self._download_webpage(url, video_id)
        # Show metadata is embedded as JSON in an inline 'gon' script variable.
        m = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
        # NOTE(review): listing gap — the 'if not m:' guard line is missing here.
        raise ExtractorError(u'Cannot find metadata')
        json_data = m.group(1)

        # NOTE(review): listing gap — 'try:' line missing here.
        data = json.loads(json_data)
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        # Fixed 256 kbps stream requested from the Akamai URL.
        video_url = data['akamai_url'] + '&cbr=256'
        url_parts = compat_urllib_parse_urlparse(video_url)
        video_ext = url_parts.path.rpartition('.')[2]
        # NOTE(review): listing gap — the 'return [{' opener and id/url/ext entries around
        # these lines are missing from this listing; entries kept verbatim.
        'title': data['title'],
        'description': data.get('teaser_text'),
        'location': data.get('country_of_origin'),
        'uploader': data.get('host', {}).get('name'),
        'uploader_id': data.get('host', {}).get('slug'),
        'thumbnail': data.get('image', {}).get('large_url_2x'),
        'duration': data.get('duration'),
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s' % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        # Return the format entry matching req_format.
        # NOTE(review): listing gap — the 'for x in formats:' loop opener and the
        # return lines of this method are missing here.
            if(x["format"]==req_format):

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): listing gap — the 'if mobj is None:' guard line is missing here.
        self._downloader.report_error(u'invalid URL: %s' % url)

        video_id = mobj.group('videoid')

        # Bypass the age gate with a cookie before fetching the page.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get the video title
        result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
        # NOTE(review): listing gap — the 'if result is None:' guard line is missing here.
        raise ExtractorError(u'Unable to extract video title')
        video_title = result.group('title').strip()

        # Get the video date
        result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
        # NOTE(review): listing gap — guard and 'else:' lines around the date handling are missing here.
        self._downloader.report_warning(u'unable to extract video date')
        upload_date = result.group('date').strip()

        # Get the video uploader
        result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
        # NOTE(review): listing gap — guard and 'else:' lines around the uploader handling are missing here.
        self._downloader.report_warning(u'unable to extract uploader')
        video_uploader = None
        video_uploader = result.group('uploader').strip()
        video_uploader = clean_html( video_uploader )

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        result = re.search(DOWNLOAD_LIST_RE, webpage)
        # NOTE(review): listing gap — the 'if result is None:' guard line is missing here.
        raise ExtractorError(u'Unable to extract download list')
        download_list_html = result.group('download_list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if(len(links) == 0):
            raise ExtractorError(u'ERROR: no known formats available for video')

        self._downloader.to_screen(u'[youporn] Links found: %d' % len(links))

        # NOTE(review): listing gap — the formats accumulator and 'for link in links:'
        # loop opener lines are missing here.
        # A link looks like this:
        # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
        # A path looks like this:
        # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
        video_url = unescapeHTML( link )
        path = compat_urllib_parse_urlparse( video_url ).path
        extension = os.path.splitext( path )[1][1:]
        # Path component 4 encodes resolution and bitrate, e.g. '480p_370k_...'.
        format = path.split('/')[4].split('_')[:2]
        # NOTE(review): listing gap — the size/bitrate unpacking lines are missing here.
        format = "-".join( format )
        title = u'%s-%s-%s' % (video_title, size, bitrate)
        # NOTE(review): listing gap — the 'formats.append({' opener and id/url/title/ext
        # entries around these lines are missing; entries kept verbatim.
        'uploader': video_uploader,
        'upload_date': upload_date,
        'description': None,

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            # NOTE(review): listing gap — the early 'return' after listing formats is missing here.

        req_format = self._downloader.params.get('format', None)
        self._downloader.to_screen(u'[youporn] Format: %s' % req_format)

        # NOTE(review): listing gap — the return statements for the best/all branches are missing.
        if req_format is None or req_format == 'best':
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
        # NOTE(review): listing gap — the 'else:' opener and the not-found guard for the
        # specific-format branch are missing here.
        format = self._specific( req_format, formats )
        self._downloader.report_error(u'requested format not available')
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): listing gap — the 'if mobj is None:' guard line is missing here.
        self._downloader.report_error(u'invalid URL: %s' % url)

        # Both the numeric id and the title are taken from the URL itself.
        video_id = mobj.group('videoid')
        video_title = mobj.group('title')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video URL
        VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
        result = re.search(VIDEO_URL_RE, webpage)
        # NOTE(review): listing gap — the 'if result is None:' guard line is missing here.
        self._downloader.report_error(u'unable to extract video url')
        video_url = compat_urllib_parse.unquote(result.group('url'))

        #Get the uploaded date
        VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
        result = re.search(VIDEO_UPLOADED_RE, webpage)
        # NOTE(review): listing gap — the 'if result is None:' guard line is missing here.
        self._downloader.report_error(u'unable to extract video title')
        upload_date = result.group('date')

        # NOTE(review): listing gap — some entries and the closing/return lines of this
        # dict are missing from this listing; visible lines kept verbatim.
        info = {'id': video_id,
                'upload_date': upload_date,
                'title': video_title,
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): `if mobj is None:` guard appears elided before this call.
        self._downloader.report_error(u'invalid URL: %s' % url)

        video_id = mobj.group('videoid')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video title
        result = re.search(r'<title>(?P<title>.*)</title>', webpage)
        # NOTE(review): `if result is None:` guards appear elided before each of
        # the raises in this method.
        raise ExtractorError(u'ERROR: unable to extract video title')
        video_title = result.group('title').strip()

        # Get the embed page
        result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        raise ExtractorError(u'ERROR: unable to extract embed page')

        embed_page_url = result.group(0).strip()
        # Switch to the numeric id used by the embed page.
        video_id = result.group('videoid')

        webpage = self._download_webpage(embed_page_url, video_id)

        # The real media URL is handed to the flash player via
        # so.addVariable("file", encodeURIComponent(...)).
        result = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
        raise ExtractorError(u'ERROR: unable to extract video url')
        video_url = result.group('source')

        # NOTE(review): entries such as 'url' and 'ext', and the return, are
        # not visible in this chunk.
        info = {'id': video_id,
                'title': video_title,
                'player_url': embed_page_url}
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes (a playlist of songs)."""

    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): `if mobj is None:` guard appears elided before this raise.
        raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        # The page embeds the mix metadata as a JSON literal assigned to PAGE.mix.
        m = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
        # NOTE(review): `if not m:` guard appears elided before this raise.
        raise ExtractorError(u'Cannot find trax information')
        json_like = m.group(1)
        data = json.loads(json_like)

        # The play API requires an arbitrary per-session token.
        session = str(random.randint(0, 1000000000))
        # NOTE(review): `mix_id` is used below but its assignment (presumably
        # taken from `data`) is not visible in this chunk — confirm.
        track_count = data['tracks_count']
        first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
        next_url = first_url

        # The API only reveals one track at a time, so walk it sequentially.
        for i in itertools.count():
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(i+1), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            # NOTE(review): the opening of this per-track info dict (and the
            # append to the result list) is not visible in this chunk.
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
            # Stop once the API marks the final track, otherwise ask for the next one.
            if api_data['set']['at_last_track']:
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
class KeekIE(InfoExtractor):
    """Information extractor for keek.com."""

    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')
        # Media and thumbnail URLs are derived directly from the video id on the CDN.
        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
        webpage = self._download_webpage(url, video_id)
        # Title comes from the OpenGraph meta tag.
        m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
        title = unescapeHTML(m.group('title'))
        m = re.search(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>', webpage)
        uploader = clean_html(m.group('uploader'))
        # NOTE(review): the opening of the info dict (entries like 'id', 'url',
        # 'ext', 'title') and the return are not visible in this chunk.
        'thumbnail': thumbnail,
        'uploader': uploader
class TEDIE(InfoExtractor):
    """Information extractor for ted.com talks and playlists."""

    # Verbose (re.VERBOSE) pattern: matches either a playlist URL or a single
    # talk URL. NOTE(review): the alternation line and the closing ''' of this
    # pattern are not visible in this chunk — confirm against upstream.
    _VALID_URL=r'''http://www.ted.com/
        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
        ((?P<type_talk>talks)) # We have a simple talk
        /(?P<name>\w+) # Here goes the name and then ".html"

    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL must be matched with re.VERBOSE.
        # NOTE(review): the @classmethod decorator is not visible in this chunk.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        m=re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type_talk'):
            # Single talk page.
            return [self._talk_info(url)]
        # NOTE(review): an `else:` introducing the playlist branch appears elided.
        playlist_id=m.group('playlist_id')
        name=m.group('name')
        self._downloader.to_screen(u'[%s] Getting info of playlist %s: "%s"' % (self.IE_NAME,playlist_id,name))
        return [self._playlist_videos_info(url,name,playlist_id)]

    def _talk_video_link(self,mediaSlug):
        '''Returns the video link for that mediaSlug'''
        return 'http://download.ted.com/talks/%s.mp4' % mediaSlug

    def _playlist_videos_info(self,url,name,playlist_id=0):
        '''Returns the videos of the playlist'''
        # NOTE(review): the opening assignment (presumably video_RE=r''' ...) of
        # this verbose pattern is not visible in this chunk.
        <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
        ([.\s]*?)data-playlist_item_id="(\d+)"
        ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
        video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
        m_names=re.finditer(video_name_RE,webpage)

        # Playlist title from the page headline.
        playlist_RE = r'div class="headline">(\s*?)<h1>(\s*?)<span>(?P<playlist_title>.*?)</span>'
        m_playlist = re.search(playlist_RE, webpage)
        playlist_title = m_playlist.group('playlist_title')

        # Each entry is delegated back to this extractor as a plain talk URL.
        playlist_entries = []
        for m_video, m_name in zip(m_videos,m_names):
            video_id=m_video.group('video_id')
            talk_url='http://www.ted.com%s' % m_name.group('talk_url')
            playlist_entries.append(self.url_result(talk_url, 'TED'))
        return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        m=re.match(self._VALID_URL, url,re.VERBOSE)
        videoName=m.group('name')
        webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
        # If the url includes the language we get the title translated
        title_RE=r'<span id="altHeadline" >(?P<title>.*)</span>'
        title=re.search(title_RE, webpage).group('title')
        # The numeric id and media slug sit in an inline talkDetails script blob.
        info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
                        "id":(?P<videoID>[\d]+).*?
                        "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
        thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
        thumb_match=re.search(thumb_RE,webpage)
        info_match=re.search(info_RE,webpage,re.VERBOSE)
        video_id=info_match.group('videoID')
        mediaSlug=info_match.group('mediaSlug')
        video_url=self._talk_video_link(mediaSlug)
        # NOTE(review): the returned info dict (entries like 'id', 'url',
        # 'title', 'ext') is only partially visible in this chunk.
            'thumbnail': thumb_match.group('thumbnail')
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de; metadata comes from an XML API."""

    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        # NOTE(review): the guard for the trailing-slash case (likely
        # `if not video_id:`) is not visible before this fallback split.
        _, video_id = os.path.split(url_parent_path)

        # Fetch and parse the XML metadata document for this id.
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            # NOTE(review): a `return` after this error report is not visible here.
            self._downloader.report_error(u'unable to extract download url')
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            self._downloader.report_error(u'unable to extract title')
        title = title_el.text
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            # NOTE(review): the fallback assignment for a missing format_id is
            # not visible in this chunk.
        format = format_id_el.text
        description_el = metadata.find('description')
        if description_el is not None:
            description = description_el.text
        # NOTE(review): the else-branch default for `description` is elided.
        imagePreview_el = metadata.find('imagePreview')
        if imagePreview_el is not None:
            thumbnail = imagePreview_el.text

        # NOTE(review): the opening of the info dict ('id', 'url', 'title',
        # 'ext', 'format', ...) and the return are not visible in this chunk.
            'thumbnail': thumbnail,
            'description': description
class SpiegelIE(InfoExtractor):
    """Information extractor for spiegel.de videos."""

    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')

        webpage = self._download_webpage(url, video_id)
        m = re.search(r'<div class="spVideoTitle">(.*?)</div>', webpage)
        # NOTE(review): the `if not m:` guard before this raise appears elided.
        raise ExtractorError(u'Cannot find title')
        video_title = unescapeHTML(m.group(1))

        # A per-video flash XML descriptor lists the available stream variants.
        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(xml_url, video_id,
            note=u'Downloading XML', errnote=u'Failed to download XML')

        idoc = xml.etree.ElementTree.fromstring(xml_code)
        # Picks the last variant entry — presumably the best quality; confirm.
        last_type = idoc[-1]
        filename = last_type.findall('./filename')[0].text
        duration = float(last_type.findall('./duration')[0].text)

        video_url = 'http://video2.spiegel.de/flash/' + filename
        video_ext = filename.rpartition('.')[2]
        # NOTE(review): the opening of the info dict ('id', 'url', 'ext', ...)
        # and the return are not visible in this chunk.
            'title': video_title,
            'duration': duration,
class LiveLeakIE(InfoExtractor):
    """Information extractor for liveleak.com view pages."""

    _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): `if mobj is None:` guard appears elided. Also note this
        # method mixes the deprecated trouble() with report_error(), unlike
        # sibling extractors — worth unifying.
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        # Direct media URL from the embedded player configuration.
        m = re.search(r'file: "(.*?)",', webpage)
        # NOTE(review): `if not m:` guard appears elided before this call.
        self._downloader.report_error(u'unable to find video url')

        video_url = m.group(1)

        m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
        self._downloader.trouble(u'Cannot find video title')

        # Strip the site branding prefix from the og:title value.
        title = unescapeHTML(m.group('title')).replace('LiveLeak.com -', '').strip()

        # Description and uploader are best-effort; guards around these
        # extractions are not visible in this chunk.
        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        desc = unescapeHTML(m.group('desc'))

        m = re.search(r'By:.*?(\w+)</a>', webpage)
        uploader = clean_html(m.group(1))

        # NOTE(review): the opening of the info dict ('id', 'url', 'ext',
        # 'title', ...) and the return are not visible in this chunk.
            'description': desc,
            'uploader': uploader
class ARDIE(InfoExtractor):
    """Information extractor for the ARD Mediathek / daserste.de Mediathek."""

    _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
    # Page title headline.
    _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
    # One addMediaStream(...) JS call per available stream variant.
    _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'

    def _real_extract(self, url):
        # determine video id from url
        m = re.match(self._VALID_URL, url)
        # A documentId= query parameter takes precedence over the path segment.
        numid = re.search(r'documentId=([0-9]+)', url)
        # NOTE(review): the `if numid:` / `else:` framing around these two
        # assignments is not visible in this chunk.
        video_id = numid.group(1)
        video_id = m.group('video_id')

        # determine title and media streams from webpage
        html = self._download_webpage(url, video_id)
        title = re.search(self._TITLE, html).group('title')
        streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
        # NOTE(review): an `if not streams:` guard appears elided here; a "fsk"
        # marker in the page indicates an age-restricted (after 8 pm) video.
        assert '"fsk"' in html
        self._downloader.report_error(u'this video is only available after 8:00 pm')

        # choose default media type and highest quality for now
        stream = max([s for s in streams if int(s["media_type"]) == 0],
                     key=lambda s: int(s["quality"]))

        # there's two possibilities: RTMP stream or HTTP download
        info = {'id': video_id, 'title': title, 'ext': 'mp4'}
        if stream['rtmp_url']:
            self._downloader.to_screen(u'[%s] RTMP download detected' % self.IE_NAME)
            assert stream['video_url'].startswith('mp4:')
            info["url"] = stream["rtmp_url"]
            info["play_path"] = stream['video_url']
        # NOTE(review): the `else:` marker for the HTTP-download branch and the
        # final return of `info` are not visible in this chunk.
            assert stream["video_url"].endswith('.mp4')
            info["url"] = stream["video_url"]
# NOTE(review): in this chunk the docstring close, the `return [` opening of
# the extractor list, and most of the list entries are not visible — only a
# few representative instances appear below. Order is significant: the first
# extractor whose suitable() matches handles the URL.
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
        YoutubePlaylistIE(),
        StanfordOpenClassroomIE(),
        WorldStarHipHopIE(),
def get_info_extractor(ie_name):
    """Returns the info extractor class with the given ie_name"""
    # Every extractor class in this module follows the "<name>IE" naming
    # convention and lives at module level, so a plain globals() lookup
    # resolves the short name to its class. Raises KeyError if unknown.
    class_name = ie_name + 'IE'
    return globals()[class_name]