2 # -*- coding: utf-8 -*-
4 from __future__ import absolute_import
15 import xml.etree.ElementTree
23 class InfoExtractor(object):
24 """Information Extractor class.
26 Information extractors are the classes that, given a URL, extract
27 information about the video (or videos) the URL refers to. This
28 information includes the real video URL, the video title, author and
29 others. The information is stored in a dictionary which is then
30 passed to the FileDownloader. The FileDownloader processes this
31 information possibly downloading the video to the file system, among
32 other possible outcomes.
34 The dictionaries must include the following fields:
38 title: Video title, unescaped.
39 ext: Video filename extension.
41 The following fields are optional:
43 format: The video format, defaults to ext (used for --get-format)
44 thumbnail: Full URL to a video thumbnail image.
45 description: One-line video description.
46 uploader: Full name of the video uploader.
47 upload_date: Video upload date (YYYYMMDD).
48 uploader_id: Nickname or id of the video uploader.
49 location: Physical location of the video.
50 player_url: SWF Player URL (used for rtmpdump).
51 subtitles: The subtitle file contents.
52 urlhandle: [internal] The urlHandle to be used to download the file,
53 like returned by urllib.request.urlopen
55 The fields should all be Unicode strings.
57 Subclasses of this one should re-define the _real_initialize() and
58 _real_extract() methods and define a _VALID_URL regexp.
59 Probably, they should also be added to the list of extractors.
61 _real_extract() must return a *list* of information dictionaries as
64 Finally, the _WORKING attribute should be set to False for broken IEs
65 in order to warn the users and skip the tests.
72 def __init__(self, downloader=None):
73 """Constructor. Receives an optional downloader."""
75 self.set_downloader(downloader)
def suitable(cls, url):
    """Tell whether this extractor can handle the given URL.

    The URL is accepted when the class-level _VALID_URL pattern matches
    at the start of it.
    """
    matched = re.match(cls._VALID_URL, url)
    return matched is not None
84 """Getter method for _WORKING."""
88 """Initializes an instance (authentication, etc)."""
90 self._real_initialize()
93 def extract(self, url):
94 """Extracts URL information and returns it in list of dicts."""
96 return self._real_extract(url)
def set_downloader(self, downloader):
    """Attach a downloader to this IE.

    The object is stored as-is in self._downloader; None is accepted.
    """
    self._downloader = downloader
102 def _real_initialize(self):
103 """Real initialization process. Redefine in subclasses."""
106 def _real_extract(self, url):
107 """Real extraction process. Redefine in subclasses."""
112 return type(self).__name__[:-2]
114 def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
115 """ Returns the response handle """
117 note = u'Downloading video webpage'
118 if note is not False:
119 self._downloader.to_screen(u'[%s] %s: %s' % (self.IE_NAME, video_id, note))
121 return compat_urllib_request.urlopen(url_or_request)
122 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
124 errnote = u'Unable to download webpage'
125 raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])
127 def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
128 """ Returns the data of the page as a string """
129 urlh = self._request_webpage(url_or_request, video_id, note, errnote)
130 content_type = urlh.headers.get('Content-Type', '')
131 m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
133 encoding = m.group(1)
136 webpage_bytes = urlh.read()
137 if self._downloader.params.get('dump_intermediate_pages', False):
139 url = url_or_request.get_full_url()
140 except AttributeError:
142 self._downloader.to_screen(u'Dumping request to ' + url)
143 dump = base64.b64encode(webpage_bytes).decode('ascii')
144 self._downloader.to_screen(dump)
145 return webpage_bytes.decode(encoding, 'replace')
147 #Methods for following #608
148 #They set the correct value of the '_type' key
149 def video_result(self, video_info):
150 """Returns a video"""
151 video_info['_type'] = 'video'
153 def url_result(self, url, ie=None):
154 """Returns a url that points to a page that should be processed"""
155 #TODO: ie should be the class used for getting the info
156 video_info = {'_type': 'url',
160 def playlist_result(self, entries, playlist_id=None, playlist_title=None):
161 """Returns a playlist"""
162 video_info = {'_type': 'playlist',
165 video_info['id'] = playlist_id
167 video_info['title'] = playlist_title
171 class YoutubeIE(InfoExtractor):
172 """Information extractor for youtube.com."""
176 (?:https?://)? # http(s):// (optional)
177 (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
178 tube\.majestyc\.net/) # the various hostnames, with wildcard subdomains
179 (?:.*?\#/)? # handle anchor (#/) redirect urls
180 (?: # the various things that can precede the ID:
181 (?:(?:v|embed|e)/) # v/ or embed/ or e/
182 |(?: # or the v= param in all its forms
183 (?:watch(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
184 (?:\?|\#!?) # the params delimiter ? or # or #!
185 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
188 )? # optional -> youtube.com/xxxx is OK
189 )? # all until now is optional -> you can pass the naked ID
190 ([0-9A-Za-z_-]+) # here is it! the YouTube video ID
191 (?(1).+)? # if we found the ID, everything can follow
193 _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
194 _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
195 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
196 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
197 _NETRC_MACHINE = 'youtube'
198 # Listed in order of quality
199 _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
200 _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
201 _video_extensions = {
207 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
213 _video_dimensions = {
def suitable(cls, url):
    """Tell whether this extractor can handle the given URL.

    URLs that YoutubePlaylistIE declares suitable are rejected here;
    everything else is checked against _VALID_URL, which is compiled in
    verbose mode.
    """
    if YoutubePlaylistIE.suitable(url):
        return False
    matched = re.match(cls._VALID_URL, url, re.VERBOSE)
    return matched is not None
def report_lang(self):
    """Print a notice that the language is being set."""
    message = u'[youtube] Setting language'
    self._downloader.to_screen(message)
def report_login(self):
    """Print a notice that a login attempt is starting."""
    message = u'[youtube] Logging in'
    self._downloader.to_screen(message)
def report_age_confirmation(self):
    """Print a notice that age confirmation is being attempted."""
    message = u'[youtube] Confirming age'
    self._downloader.to_screen(message)
def report_video_webpage_download(self, video_id):
    """Print a notice that the video webpage for video_id is being downloaded."""
    message = u'[youtube] %s: Downloading video webpage' % video_id
    self._downloader.to_screen(message)
def report_video_info_webpage_download(self, video_id):
    """Print a notice that the video info webpage for video_id is being downloaded."""
    message = u'[youtube] %s: Downloading video info webpage' % video_id
    self._downloader.to_screen(message)
def report_video_subtitles_download(self, video_id):
    """Print a notice that the available subtitles for video_id are being checked."""
    message = u'[youtube] %s: Checking available subtitles' % video_id
    self._downloader.to_screen(message)
def report_video_subtitles_request(self, video_id, sub_lang, format):
    """Print a notice that subtitles are being downloaded.

    The message names the video, the subtitle language and the subtitle
    format, the last two joined as "<sub_lang>.<format>".
    """
    details = (video_id, sub_lang, format)
    message = u'[youtube] %s: Downloading video subtitles for %s.%s' % details
    self._downloader.to_screen(message)
def report_video_subtitles_available(self, video_id, sub_lang_list):
    """Print the subtitle languages available for video_id.

    sub_lang_list must provide .keys(); the keys are printed
    comma-separated.
    """
    languages = ",".join(sub_lang_list.keys())
    message = u'[youtube] %s: Available subtitles for video: %s' % (video_id, languages)
    self._downloader.to_screen(message)
def report_information_extraction(self, video_id):
    """Print a notice that information extraction for video_id is starting."""
    message = u'[youtube] %s: Extracting video information' % video_id
    self._downloader.to_screen(message)
def report_unavailable_format(self, video_id, format):
    """Print a notice that the given format is not available for video_id."""
    message = u'[youtube] %s: Format %s not available' % (video_id, format)
    self._downloader.to_screen(message)
def report_rtmp_download(self):
    """Print a notice that an RTMP download was detected."""
    message = u'[youtube] RTMP download detected'
    self._downloader.to_screen(message)
282 def _get_available_subtitles(self, video_id):
283 self.report_video_subtitles_download(video_id)
284 request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
286 sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
287 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
288 return (u'unable to download video subtitles: %s' % compat_str(err), None)
289 sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
290 sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
291 if not sub_lang_list:
292 return (u'video doesn\'t have subtitles', None)
295 def _list_available_subtitles(self, video_id):
296 sub_lang_list = self._get_available_subtitles(video_id)
297 self.report_video_subtitles_available(video_id, sub_lang_list)
299 def _request_subtitle(self, sub_lang, sub_name, video_id, format):
302 (error_message, sub_lang, sub)
304 self.report_video_subtitles_request(video_id, sub_lang, format)
305 params = compat_urllib_parse.urlencode({
311 url = 'http://www.youtube.com/api/timedtext?' + params
313 sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
314 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
315 return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
317 return (u'Did not fetch video subtitles', None, None)
318 return (None, sub_lang, sub)
320 def _extract_subtitle(self, video_id):
322 Return a list with a tuple:
323 [(error_message, sub_lang, sub)]
325 sub_lang_list = self._get_available_subtitles(video_id)
326 sub_format = self._downloader.params.get('subtitlesformat')
327 if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
328 return [(sub_lang_list[0], None, None)]
329 if self._downloader.params.get('subtitleslang', False):
330 sub_lang = self._downloader.params.get('subtitleslang')
331 elif 'en' in sub_lang_list:
334 sub_lang = list(sub_lang_list.keys())[0]
335 if not sub_lang in sub_lang_list:
336 return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]
338 subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
341 def _extract_all_subtitles(self, video_id):
342 sub_lang_list = self._get_available_subtitles(video_id)
343 sub_format = self._downloader.params.get('subtitlesformat')
344 if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
345 return [(sub_lang_list[0], None, None)]
347 for sub_lang in sub_lang_list:
348 subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
349 subtitles.append(subtitle)
352 def _print_formats(self, formats):
353 print('Available formats:')
355 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
357 def _real_initialize(self):
358 if self._downloader is None:
363 downloader_params = self._downloader.params
365 # Attempt to use provided username and password or .netrc data
366 if downloader_params.get('username', None) is not None:
367 username = downloader_params['username']
368 password = downloader_params['password']
369 elif downloader_params.get('usenetrc', False):
371 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
376 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
377 except (IOError, netrc.NetrcParseError) as err:
378 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
382 request = compat_urllib_request.Request(self._LANG_URL)
385 compat_urllib_request.urlopen(request).read()
386 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
387 self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
390 # No authentication to be performed
394 request = compat_urllib_request.Request(self._LOGIN_URL)
396 login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
397 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
398 self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
403 match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
405 galx = match.group(1)
407 match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
413 u'continue': u'http://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
417 u'PersistentCookie': u'yes',
419 u'bgresponse': u'js_disabled',
420 u'checkConnection': u'',
421 u'checkedDomains': u'youtube',
427 u'signIn': u'Sign in',
429 u'service': u'youtube',
433 # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
435 login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
436 login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
437 request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
440 login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
441 if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
442 self._downloader.report_warning(u'unable to log in: bad username or password')
444 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
445 self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
451 'action_confirm': 'Confirm',
453 request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
455 self.report_age_confirmation()
456 age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
457 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
458 self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
461 def _extract_id(self, url):
462 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
464 self._downloader.report_error(u'invalid URL: %s' % url)
466 video_id = mobj.group(2)
469 def _real_extract(self, url):
470 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
471 mobj = re.search(self._NEXT_URL_RE, url)
473 url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
474 video_id = self._extract_id(url)
477 self.report_video_webpage_download(video_id)
478 url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
479 request = compat_urllib_request.Request(url)
481 video_webpage_bytes = compat_urllib_request.urlopen(request).read()
482 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
483 self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
486 video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')
488 # Attempt to extract SWF player URL
489 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
491 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
496 self.report_video_info_webpage_download(video_id)
497 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
498 video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
499 % (video_id, el_type))
500 video_info_webpage = self._download_webpage(video_info_url, video_id,
502 errnote='unable to download video info webpage')
503 video_info = compat_parse_qs(video_info_webpage)
504 if 'token' in video_info:
506 if 'token' not in video_info:
507 if 'reason' in video_info:
508 self._downloader.report_error(u'YouTube said: %s' % video_info['reason'][0])
510 self._downloader.report_error(u'"token" parameter not in video info for unknown reason')
513 # Check for "rental" videos
514 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
515 self._downloader.report_error(u'"rental" videos not supported')
518 # Start extracting information
519 self.report_information_extraction(video_id)
522 if 'author' not in video_info:
523 self._downloader.report_error(u'unable to extract uploader name')
525 video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
528 video_uploader_id = None
529 mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
531 video_uploader_id = mobj.group(1)
533 self._downloader.report_warning(u'unable to extract uploader nickname')
536 if 'title' not in video_info:
537 self._downloader.report_error(u'unable to extract video title')
539 video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
542 if 'thumbnail_url' not in video_info:
543 self._downloader.report_warning(u'unable to extract video thumbnail')
545 else: # don't panic if we can't find it
546 video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
550 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
552 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
553 format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
554 for expression in format_expressions:
556 upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
561 video_description = get_element_by_id("eow-description", video_webpage)
562 if video_description:
563 video_description = clean_html(video_description)
565 video_description = ''
568 video_subtitles = None
570 if self._downloader.params.get('writesubtitles', False):
571 video_subtitles = self._extract_subtitle(video_id)
573 (sub_error, sub_lang, sub) = video_subtitles[0]
575 self._downloader.report_error(sub_error)
577 if self._downloader.params.get('allsubtitles', False):
578 video_subtitles = self._extract_all_subtitles(video_id)
579 for video_subtitle in video_subtitles:
580 (sub_error, sub_lang, sub) = video_subtitle
582 self._downloader.report_error(sub_error)
584 if self._downloader.params.get('listsubtitles', False):
585 sub_lang_list = self._list_available_subtitles(video_id)
588 if 'length_seconds' not in video_info:
589 self._downloader.report_warning(u'unable to extract video duration')
592 video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
595 video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])
597 # Decide which formats to download
598 req_format = self._downloader.params.get('format', None)
600 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
601 self.report_rtmp_download()
602 video_url_list = [(None, video_info['conn'][0])]
603 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
604 url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
605 url_data = [compat_parse_qs(uds) for uds in url_data_strs]
606 url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
607 url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)
609 format_limit = self._downloader.params.get('format_limit', None)
610 available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
611 if format_limit is not None and format_limit in available_formats:
612 format_list = available_formats[available_formats.index(format_limit):]
614 format_list = available_formats
615 existing_formats = [x for x in format_list if x in url_map]
616 if len(existing_formats) == 0:
617 self._downloader.report_error(u'no known formats available for video')
619 if self._downloader.params.get('listformats', None):
620 self._print_formats(existing_formats)
622 if req_format is None or req_format == 'best':
623 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
624 elif req_format == 'worst':
625 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
626 elif req_format in ('-1', 'all'):
627 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
# Specific formats. We pick the first in a slash-delimited sequence.
630 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
631 req_formats = req_format.split('/')
632 video_url_list = None
633 for rf in req_formats:
635 video_url_list = [(rf, url_map[rf])]
637 if video_url_list is None:
638 self._downloader.report_error(u'requested format not available')
641 self._downloader.report_error(u'no conn or url_encoded_fmt_stream_map information found in video info')
645 for format_param, video_real_url in video_url_list:
647 video_extension = self._video_extensions.get(format_param, 'flv')
649 video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
650 self._video_dimensions.get(format_param, '???'))
654 'url': video_real_url,
655 'uploader': video_uploader,
656 'uploader_id': video_uploader_id,
657 'upload_date': upload_date,
658 'title': video_title,
659 'ext': video_extension,
660 'format': video_format,
661 'thumbnail': video_thumbnail,
662 'description': video_description,
663 'player_url': player_url,
664 'subtitles': video_subtitles,
665 'duration': video_duration
670 class MetacafeIE(InfoExtractor):
671 """Information Extractor for metacafe.com."""
673 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
674 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
675 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
676 IE_NAME = u'metacafe'
def __init__(self, downloader=None):
    """Create the extractor, forwarding the optional downloader to InfoExtractor."""
    InfoExtractor.__init__(self, downloader=downloader)
def report_disclaimer(self):
    """Print a notice that the disclaimer is being retrieved."""
    message = u'[metacafe] Retrieving disclaimer'
    self._downloader.to_screen(message)
def report_age_confirmation(self):
    """Print a notice that age confirmation is being attempted."""
    message = u'[metacafe] Confirming age'
    self._downloader.to_screen(message)
def report_download_webpage(self, video_id):
    """Print a notice that the webpage for video_id is being downloaded."""
    message = u'[metacafe] %s: Downloading webpage' % video_id
    self._downloader.to_screen(message)
def report_extraction(self, video_id):
    """Print a notice that information for video_id is being extracted."""
    message = u'[metacafe] %s: Extracting information' % video_id
    self._downloader.to_screen(message)
697 def _real_initialize(self):
698 # Retrieve disclaimer
699 request = compat_urllib_request.Request(self._DISCLAIMER)
701 self.report_disclaimer()
702 disclaimer = compat_urllib_request.urlopen(request).read()
703 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
704 self._downloader.report_error(u'unable to retrieve disclaimer: %s' % compat_str(err))
710 'submit': "Continue - I'm over 18",
712 request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
714 self.report_age_confirmation()
715 disclaimer = compat_urllib_request.urlopen(request).read()
716 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
717 self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
720 def _real_extract(self, url):
721 # Extract id and simplified title from URL
722 mobj = re.match(self._VALID_URL, url)
724 self._downloader.report_error(u'invalid URL: %s' % url)
727 video_id = mobj.group(1)
729 # Check if video comes from YouTube
730 mobj2 = re.match(r'^yt-(.*)$', video_id)
731 if mobj2 is not None:
732 return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1), 'Youtube')]
734 # Retrieve video webpage to extract further information
735 webpage = self._download_webpage('http://www.metacafe.com/watch/%s/' % video_id, video_id)
737 # Extract URL, uploader and title from webpage
738 self.report_extraction(video_id)
739 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
741 mediaURL = compat_urllib_parse.unquote(mobj.group(1))
742 video_extension = mediaURL[-3:]
744 # Extract gdaKey if available
745 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
749 gdaKey = mobj.group(1)
750 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
752 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
754 self._downloader.report_error(u'unable to extract media URL')
756 vardict = compat_parse_qs(mobj.group(1))
757 if 'mediaData' not in vardict:
758 self._downloader.report_error(u'unable to extract media URL')
760 mobj = re.search(r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
762 self._downloader.report_error(u'unable to extract media URL')
764 mediaURL = mobj.group('mediaURL').replace('\\/', '/')
765 video_extension = mediaURL[-3:]
766 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key'))
768 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
770 self._downloader.report_error(u'unable to extract title')
772 video_title = mobj.group(1).decode('utf-8')
774 mobj = re.search(r'submitter=(.*?);', webpage)
776 self._downloader.report_error(u'unable to extract uploader nickname')
778 video_uploader = mobj.group(1)
781 'id': video_id.decode('utf-8'),
782 'url': video_url.decode('utf-8'),
783 'uploader': video_uploader.decode('utf-8'),
785 'title': video_title,
786 'ext': video_extension.decode('utf-8'),
790 class DailymotionIE(InfoExtractor):
791 """Information Extractor for Dailymotion"""
793 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
794 IE_NAME = u'dailymotion'
def __init__(self, downloader=None):
    """Create the extractor, forwarding the optional downloader to InfoExtractor."""
    InfoExtractor.__init__(self, downloader=downloader)
def report_extraction(self, video_id):
    """Print a notice that information for video_id is being extracted."""
    message = u'[dailymotion] %s: Extracting information' % video_id
    self._downloader.to_screen(message)
804 def _real_extract(self, url):
805 # Extract id and simplified title from URL
806 mobj = re.match(self._VALID_URL, url)
808 self._downloader.report_error(u'invalid URL: %s' % url)
811 video_id = mobj.group(1).split('_')[0].split('?')[0]
813 video_extension = 'mp4'
815 # Retrieve video webpage to extract further information
816 request = compat_urllib_request.Request(url)
817 request.add_header('Cookie', 'family_filter=off')
818 webpage = self._download_webpage(request, video_id)
820 # Extract URL, uploader and title from webpage
821 self.report_extraction(video_id)
822 mobj = re.search(r'\s*var flashvars = (.*)', webpage)
824 self._downloader.report_error(u'unable to extract media URL')
826 flashvars = compat_urllib_parse.unquote(mobj.group(1))
828 for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
831 self._downloader.to_screen(u'[dailymotion] Using %s' % key)
834 self._downloader.report_error(u'unable to extract video URL')
837 mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
839 self._downloader.report_error(u'unable to extract video URL')
842 video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')
844 # TODO: support choosing qualities
846 mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
848 self._downloader.report_error(u'unable to extract title')
850 video_title = unescapeHTML(mobj.group('title'))
852 video_uploader = None
853 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
# looking for official user
856 mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
857 if mobj_official is None:
858 self._downloader.report_warning(u'unable to extract uploader nickname')
860 video_uploader = mobj_official.group(1)
862 video_uploader = mobj.group(1)
864 video_upload_date = None
865 mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
867 video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)
872 'uploader': video_uploader,
873 'upload_date': video_upload_date,
874 'title': video_title,
875 'ext': video_extension,
879 class PhotobucketIE(InfoExtractor):
880 """Information extractor for photobucket.com."""
882 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
883 IE_NAME = u'photobucket'
def __init__(self, downloader=None):
    """Create the extractor, forwarding the optional downloader to InfoExtractor."""
    InfoExtractor.__init__(self, downloader=downloader)
def report_download_webpage(self, video_id):
    """Print a notice that the webpage for video_id is being downloaded."""
    message = u'[photobucket] %s: Downloading webpage' % video_id
    self._downloader.to_screen(message)
def report_extraction(self, video_id):
    """Print a notice that information for video_id is being extracted."""
    message = u'[photobucket] %s: Extracting information' % video_id
    self._downloader.to_screen(message)
896 def _real_extract(self, url):
897 # Extract id from URL
898 mobj = re.match(self._VALID_URL, url)
900 self._downloader.report_error(u'Invalid URL: %s' % url)
903 video_id = mobj.group(1)
905 video_extension = 'flv'
907 # Retrieve video webpage to extract further information
908 request = compat_urllib_request.Request(url)
910 self.report_download_webpage(video_id)
911 webpage = compat_urllib_request.urlopen(request).read()
912 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
913 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
916 # Extract URL, uploader, and title from webpage
917 self.report_extraction(video_id)
918 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
920 self._downloader.report_error(u'unable to extract media URL')
922 mediaURL = compat_urllib_parse.unquote(mobj.group(1))
926 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
928 self._downloader.report_error(u'unable to extract title')
930 video_title = mobj.group(1).decode('utf-8')
932 video_uploader = mobj.group(2).decode('utf-8')
935 'id': video_id.decode('utf-8'),
936 'url': video_url.decode('utf-8'),
937 'uploader': video_uploader,
939 'title': video_title,
940 'ext': video_extension.decode('utf-8'),
944 class YahooIE(InfoExtractor):
945 """Information extractor for video.yahoo.com."""
948 # _VALID_URL matches all Yahoo! Video URLs
949 # _VPAGE_URL matches only the extractable '/watch/' URLs
950 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
951 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
952 IE_NAME = u'video.yahoo'
def __init__(self, downloader=None):
    """Create the extractor, forwarding the optional downloader to InfoExtractor."""
    InfoExtractor.__init__(self, downloader=downloader)
def report_download_webpage(self, video_id):
    """Print a notice that the webpage for video_id is being downloaded."""
    message = u'[video.yahoo] %s: Downloading webpage' % video_id
    self._downloader.to_screen(message)
def report_extraction(self, video_id):
    """Print a notice that information for video_id is being extracted."""
    message = u'[video.yahoo] %s: Extracting information' % video_id
    self._downloader.to_screen(message)
965 def _real_extract(self, url, new_video=True):
966 # Extract ID from URL
967 mobj = re.match(self._VALID_URL, url)
969 self._downloader.report_error(u'Invalid URL: %s' % url)
972 video_id = mobj.group(2)
973 video_extension = 'flv'
975 # Rewrite valid but non-extractable URLs as
976 # extractable English language /watch/ URLs
977 if re.match(self._VPAGE_URL, url) is None:
978 request = compat_urllib_request.Request(url)
980 webpage = compat_urllib_request.urlopen(request).read()
981 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
982 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
985 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
987 self._downloader.report_error(u'Unable to extract id field')
989 yahoo_id = mobj.group(1)
991 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
993 self._downloader.report_error(u'Unable to extract vid field')
995 yahoo_vid = mobj.group(1)
997 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
998 return self._real_extract(url, new_video=False)
1000 # Retrieve video webpage to extract further information
1001 request = compat_urllib_request.Request(url)
1003 self.report_download_webpage(video_id)
1004 webpage = compat_urllib_request.urlopen(request).read()
1005 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1006 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
1009 # Extract uploader and title from webpage
1010 self.report_extraction(video_id)
1011 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1013 self._downloader.report_error(u'unable to extract video title')
1015 video_title = mobj.group(1).decode('utf-8')
1017 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1019 self._downloader.report_error(u'unable to extract video uploader')
1021 video_uploader = mobj.group(1).decode('utf-8')
1023 # Extract video thumbnail
1024 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1026 self._downloader.report_error(u'unable to extract video thumbnail')
1028 video_thumbnail = mobj.group(1).decode('utf-8')
1030 # Extract video description
1031 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1033 self._downloader.report_error(u'unable to extract video description')
1035 video_description = mobj.group(1).decode('utf-8')
1036 if not video_description:
1037 video_description = 'No description available.'
1039 # Extract video height and width
1040 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1042 self._downloader.report_error(u'unable to extract video height')
1044 yv_video_height = mobj.group(1)
1046 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1048 self._downloader.report_error(u'unable to extract video width')
1050 yv_video_width = mobj.group(1)
1052 # Retrieve video playlist to extract media URL
1053 # I'm not completely sure what all these options are, but we
1054 # seem to need most of them, otherwise the server sends a 401.
1055 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
1056 yv_bitrate = '700' # according to Wikipedia this is hard-coded
1057 request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1058 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1059 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1061 self.report_download_webpage(video_id)
1062 webpage = compat_urllib_request.urlopen(request).read()
1063 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1064 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
1067 # Extract media URL from playlist XML
1068 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1070 self._downloader.report_error(u'Unable to extract media URL')
1072 video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1073 video_url = unescapeHTML(video_url)
1076 'id': video_id.decode('utf-8'),
1078 'uploader': video_uploader,
1079 'upload_date': None,
1080 'title': video_title,
1081 'ext': video_extension.decode('utf-8'),
1082 'thumbnail': video_thumbnail.decode('utf-8'),
1083 'description': video_description,
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
    # NOTE(review): name assumed from the '[vimeo]' log prefix used below -- confirm.
    IE_NAME = u'vimeo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract the information dictionary for a Vimeo URL.

        Downloads the video page, parses the embedded player config JSON
        and picks the first available file, preferring 'hd' over 'sd' over
        anything else, and within a quality the codec order of `codecs`.
        new_video is accepted but not read.

        Returns a one-element list with the information dictionary, or
        None after an error has been reported to the downloader.
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        if not mobj.group('proto'):
            url = 'https://' + url
        if mobj.group('direct_link'):
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and later we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except (IndexError, ValueError):
            # IndexError: the config marker is absent from the page;
            # ValueError: the extracted text is not valid JSON.
            self._downloader.report_error(u'unable to extract info section')
            return

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            video_description = u''

        # Extract upload date
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = {'hd': [], 'sd': [], 'other': []}
        available = config["video"]["files"]
        for codec_name, codec_extension in codecs:
            if codec_name in available:
                if 'hd' in available[codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in available[codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, available[codec_name][0]))

        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            # No quality bucket received any entry
            self._downloader.report_error(u'no known codec found')
            return

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    % (video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
        }]
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)

    def fetch_webpage(self, url):
        """Download url and return the page body as read from the connection.

        Returns None after reporting an error when the download fails or
        the URL is rejected with ValueError.
        """
        try:
            # Building the Request can itself raise ValueError for a
            # malformed URL, so it belongs inside the guarded region.
            request = compat_urllib_request.Request(url)
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch url, search it with regex and collect the named groups.

        matchTuples is an iterable of (group_index, key, error_message);
        each group is stored under its key in the returned dictionary.
        Returns None when the page cannot be fetched, the regex does not
        match, or any requested group is None (the matching error message
        is reported in that case).
        """
        page = self.fetch_webpage(url)
        if page is None:
            # fetch_webpage() has already reported the failure
            return None

        mobj = re.search(regex, page, regexFlags)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return None

        info = {}
        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                self._downloader.trouble(err)
                return None
            info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        """Walk the live-stream pages for url.

        The stream URL is computed but not returned: this method always
        returns None.
        """
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        if info is None:
            return
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
                '(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path',   u'ERROR: could not extract video path: %s' % url),
                (2, 'player', u'ERROR: could not extract video player: %s' % url),
                (3, 'url',    u'ERROR: could not extract video url: %s' % url)
            ]
        )
        if info is None:
            return
        # Not used by any caller visible here; kept to record how the
        # stream URL is assembled.
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Follow the two reference pages for url and build its info dictionary.

        Returns the information dictionary, or None when any of the three
        page lookups fails (the failure has been reported by then).
        """
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        if info is None:
            return None
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
            ]
        )
        if info is None:
            return None
        next_url = compat_urllib_parse.unquote(info.get('url'))

        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id',    u'ERROR: could not extract video id: %s' % url),
                (2, 'title', u'ERROR: could not extract video title: %s' % url),
                (3, 'date',  u'ERROR: could not extract video date: %s' % url),
                (4, 'url',   u'ERROR: could not extract video url: %s' % url)
            ]
        )
        if info is None:
            return None

        return {
            'id':           info.get('id'),
            'url':          compat_urllib_parse.unquote(info.get('url')),
            'uploader':     u'arte.tv',
            'upload_date':  info.get('date'),
            'title':        info.get('title').decode('utf-8'),
            'ext':          u'mp4',
            'format':       u'NA',
            'player_url':   None,
        }

    def _real_extract(self, url):
        """Extract information for an arte.tv URL.

        URLs whose last path component matches _LIVE_URL go through
        extractLiveStream() and yield None.  All other URLs return a
        one-element list with the dictionary from extractPlus7Stream(),
        or None when that extraction failed.
        """
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
            return

        info = self.extractPlus7Stream(url)
        if info is None:
            return
        return [info]
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor."""

    # NOTE(review): catch-all pattern assumed for the last-resort extractor -- confirm.
    _VALID_URL = r'.*'
    IE_NAME = u'generic'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        # The fallback warning is suppressed when the 'test' param is set.
        if not self._downloader.params.get('test', False):
            self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
        self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)

    def report_following_redirect(self, new_url):
        """Report that a redirect to new_url is being followed."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case return the new url.

        Issues a HEAD request for url through a dedicated opener and
        compares the final URL with the requested one.  Returns False when
        they are equal, otherwise the final URL.
        """
        class HeadRequest(compat_urllib_request.Request):
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    # Body-related headers are not forwarded to the new request
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                fp.read()
                fp.close()

                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                 headers=newheaders,
                                                 origin_req_host=req.get_origin_req_host(),
                                                 unverifiable=True))

        # Build our opener
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        if url == new_url:
            return False

        self.report_following_redirect(new_url)
        return new_url

    def _real_extract(self, url):
        """Extract a media URL from an arbitrary page.

        If url redirects elsewhere, a single url_result for the target is
        returned.  Otherwise the page is searched with three progressively
        broader patterns for a 'file'/'source' http URL; the id and
        extension come from that URL's basename, the title from the page
        <title> and the uploader from the host part of url.

        Returns a one-element list, or None after an error has been
        reported to the downloader.
        """
        new_url = self._test_redirect(url)
        if new_url:
            return [self.url_result(new_url)]

        video_id = url.split('/')[-1]
        try:
            webpage = self._download_webpage(url, video_id)
        except ValueError as err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit
            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit: JWPlayer JS loader
            mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        video_title = mobj.group(1)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        if mobj is None:
            # This failure concerns the uploader, not the title
            self._downloader.report_error(u'unable to extract uploader')
            return
        video_uploader = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries."""
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Run a 'ytsearch[N|all]:<terms>' query.

        No count means one result, 'all' means _max_youtube_results, and a
        number above that limit is clamped with a warning.

        Returns the list produced by _get_n_results(), or None after an
        error has been reported to the downloader.
        """
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        # Split on the first colon only: the search terms may contain ':'
        prefix, query = query.split(':', 1)
        prefix = prefix[8:]  # drop the leading 'ytsearch'
        query = query.encode('utf-8')
        if prefix == '':
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            return self._get_n_results(query, self._max_youtube_results)

        # Only the integer parse is guarded, so a ValueError raised while
        # searching is not mistaken for a malformed count.
        try:
            n = int(prefix)
        except ValueError:  # parsing prefix as integer fails
            return self._get_n_results(query, 1)

        if n <= 0:
            self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
            return
        elif n > self._max_youtube_results:
            self._downloader.report_warning(u'ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
            n = self._max_youtube_results
        return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query

        Pages through the API 50 results at a time until n ids have been
        requested or the reported 'totalItems' is exhausted.  Returns a
        list of url_result entries (at most n), or None after an error.
        """

        video_ids = []
        pagenum = 0
        limit = n

        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download API page: %s' % compat_str(err))
                return
            api_response = json.loads(data)['data']

            if 'items' not in api_response:
                self._downloader.trouble(u'[youtube] No video results')
                return

            video_ids += [video['id'] for video in api_response['items']]

            # Never ask for more than the API says exists
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        videos = [self.url_result('http://www.youtube.com/watch?v=%s' % video_id, 'Youtube') for video_id in video_ids]
        return videos
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries."""
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Run a 'gvsearch[N|all]:<terms>' query and download the results.

        No count means one result, 'all' means _max_google_results, and a
        number above that limit is clamped with a warning.  Results are
        handed to the downloader directly; the method returns None.
        """
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        # Split on the first colon only: the search terms may contain ':'
        prefix, query = query.split(':', 1)
        prefix = prefix[8:]  # drop the leading 'gvsearch'
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_google_results)
            return

        # Only the integer parse is guarded, so a ValueError raised while
        # downloading cannot trigger a second, one-result download.
        try:
            n = int(prefix)
        except ValueError:  # parsing prefix as integer fails
            self._download_n_results(query, 1)
            return

        if n <= 0:
            self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
            return
        elif n > self._max_google_results:
            self._downloader.report_warning(u'gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
            n = self._max_google_results
        self._download_n_results(query, n)
        return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query

        Collects distinct video ids page by page and passes them to the
        downloader once n ids are found or no further page is advertised.
        Nothing is downloaded if fetching a result page fails.
        """

        video_ids = []
        already_seen = set()
        pagenum = 0

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in already_seen:
                    video_ids.append(video_id)
                    already_seen.add(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for found_id in video_ids:
                            self._downloader.download(['http://video.google.com/videoplay?docid=%s' % found_id])
                        return

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for found_id in video_ids:
                    self._downloader.download(['http://video.google.com/videoplay?docid=%s' % found_id])
                return

            pagenum = pagenum + 1
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""

    # NOTE(review): flag assumed to be set for this extractor -- confirm
    # before relying on it.
    _WORKING = False
    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Run a 'yvsearch[N|all]:<terms>' query and download the results.

        No count means one result, 'all' means _max_yahoo_results, and a
        number above that limit is clamped with a warning.  Results are
        handed to the downloader directly; the method returns None.
        """
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        # Split on the first colon only: the search terms may contain ':'
        prefix, query = query.split(':', 1)
        prefix = prefix[8:]  # drop the leading 'yvsearch'
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
            return

        # Only the integer parse is guarded, so a ValueError raised while
        # downloading cannot trigger a second, one-result download.
        try:
            n = int(prefix)
        except ValueError:  # parsing prefix as integer fails
            self._download_n_results(query, 1)
            return

        if n <= 0:
            self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
            return
        elif n > self._max_yahoo_results:
            self._downloader.report_warning(u'yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
            n = self._max_yahoo_results
        self._download_n_results(query, n)
        return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query

        Collects distinct video ids page by page and passes them to the
        downloader once n ids are found or no further page is advertised.
        Nothing is downloaded if fetching a result page fails.
        """

        video_ids = []
        already_seen = set()
        # NOTE(review): first page index assumed to be 1 -- confirm
        # against the search endpoint.
        pagenum = 1

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in already_seen:
                    video_ids.append(video_id)
                    already_seen.add(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for found_id in video_ids:
                            self._downloader.download(['http://video.yahoo.com/watch/%s' % found_id])
                        return

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for found_id in video_ids:
                    self._downloader.download(['http://video.yahoo.com/watch/%s' % found_id])
                return

            pagenum = pagenum + 1
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    # Matched with re.VERBOSE: whitespace inside the pattern is ignored.
    _VALID_URL = r"""(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
                        .*
                     |
                        ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
    # NOTE(review): page size assumed to be 50 -- confirm against the API limit.
    _MAX_RESULTS = 50
    IE_NAME = u'youtube:playlist'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        """Collect the videos of a playlist through the paged JSON API.

        Entries are ordered by their 'yt$position' value; entries without
        a 'content' key are skipped.

        Returns a one-element list holding the playlist result, or None
        after an error has been reported to the downloader.
        """
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        # Download playlist videos from API
        playlist_id = mobj.group(1) or mobj.group(2)
        page_num = 1
        videos = []
        # Stays None if the very first page carries no entries, so the
        # final return never touches an unbound name.
        playlist_title = None

        while True:
            self.report_download_page(playlist_id, page_num)

            url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
            try:
                page = compat_urllib_request.urlopen(url).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            try:
                response = json.loads(page)
            except ValueError as err:
                self._downloader.report_error(u'Invalid JSON in API response: ' + compat_str(err))
                return

            if 'feed' not in response:
                self._downloader.report_error(u'Got a malformed response from YouTube API')
                return
            if 'entry' not in response['feed']:
                # Number of videos is a multiple of self._MAX_RESULTS
                break

            playlist_title = response['feed']['title']['$t']

            videos += [ (entry['yt$position']['$t'], entry['content']['src'])
                        for entry in response['feed']['entry']
                        if 'content' in entry ]

            if len(response['feed']['entry']) < self._MAX_RESULTS:
                break
            page_num += 1

        videos = [v[1] for v in sorted(videos)]

        url_results = [self.url_result(video_url, 'Youtube') for video_url in videos]
        return [self.playlist_result(url_results, playlist_id, playlist_title)]
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def report_download_page(self, channel_id, pagenum):
        """Report attempt to download channel page with given number."""
        self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))

    def extract_videos_from_page(self, page):
        """Return the distinct video ids linked from page, in first-seen order."""
        ids_in_page = []
        seen = set()
        for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            video_id = mobj.group(1)
            if video_id not in seen:
                seen.add(video_id)
                ids_in_page.append(video_id)
        return ids_in_page

    def _real_extract(self, url):
        """Collect the videos of a channel.

        The first page is the HTML channel listing; while a load-more
        marker is present, further pages come from the JSON channel_ajax
        endpoint.

        Returns a one-element list holding the playlist result, or None
        after an error has been reported to the downloader.
        """
        # Extract channel id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        # Download channel page
        channel_id = mobj.group(1)
        video_ids = []
        pagenum = 1

        self.report_download_page(channel_id, pagenum)
        url = self._TEMPLATE_URL % (channel_id, pagenum)
        request = compat_urllib_request.Request(url)
        try:
            page = compat_urllib_request.urlopen(request).read().decode('utf8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
            return

        # Extract video identifiers
        ids_in_page = self.extract_videos_from_page(page)
        video_ids.extend(ids_in_page)

        # Download any subsequent channel pages using the json-based channel_ajax query
        if self._MORE_PAGES_INDICATOR in page:
            while True:
                pagenum = pagenum + 1

                self.report_download_page(channel_id, pagenum)
                url = self._MORE_PAGES_URL % (pagenum, channel_id)
                request = compat_urllib_request.Request(url)
                try:
                    page = compat_urllib_request.urlopen(request).read().decode('utf8')
                except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                    self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                    return

                try:
                    page = json.loads(page)
                except ValueError as err:
                    # Report a malformed ajax response instead of letting
                    # the parse error escape.
                    self._downloader.report_error(u'Invalid JSON in API response: ' + compat_str(err))
                    return

                ids_in_page = self.extract_videos_from_page(page['content_html'])
                video_ids.extend(ids_in_page)

                if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
                    break

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
        url_entries = [self.url_result(video_url, 'Youtube') for video_url in urls]
        return [self.playlist_result(url_entries, channel_id)]
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, start_index):
        """Tell the user which slice of the upload feed is being fetched."""
        last_index = start_index + self._GDATA_PAGE_SIZE
        message = u'[youtube] user %s: Downloading video ids from %d to %d' % (
            username, start_index, last_index)
        self._downloader.to_screen(message)

    def _real_extract(self, url):
        """Collect the uploads of the user named in url.

        The upload feed is fetched _GDATA_PAGE_SIZE ids at a time; a page
        with fewer ids than that ends the loop.

        Returns a one-element list holding a playlist result titled with
        the username, or None after an error has been reported.
        """
        # The username is the only capture group of _VALID_URL
        match = re.match(self._VALID_URL, url)
        if match is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return
        username = match.group(1)

        page_size = self._GDATA_PAGE_SIZE
        collected_ids = []
        page_index = 0

        while True:
            # The feed is 1-indexed
            start_index = page_index * page_size + 1
            self.report_download_page(username, start_index)

            feed_url = self._GDATA_URL % (username, page_size, start_index)
            request = compat_urllib_request.Request(feed_url)

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Distinct ids of this page, in order of first appearance
            page_ids = []
            for found in re.finditer(self._VIDEO_INDICATOR, page):
                candidate = found.group(1)
                if candidate in page_ids:
                    continue
                page_ids.append(candidate)

            collected_ids.extend(page_ids)

            # A short page is taken to be the last one, which saves one
            # extra request.
            if len(page_ids) < page_size:
                break

            page_index += 1

        url_results = [
            self.url_result('http://www.youtube.com/watch?v=%s' % video_id, 'Youtube')
            for video_id in collected_ids
        ]
        return [self.playlist_result(url_results, playlist_title = username)]
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
                (self.IE_NAME, username, pagenum))

    def _real_extract(self, url):
        """Collect all videos of a blip.tv user as one playlist.

        Returns a one-element list holding the playlist result, or None
        after reporting an error.
        """
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        request = compat_urllib_request.Request(url)

        try:
            page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
            return

        # The numeric users id is needed for the episode list calls; report
        # its absence instead of crashing on a None match.
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        if mobj is None:
            self._downloader.report_error(u'unable to extract users id from: %s' % url)
            return
        page_base = page_base % mobj.group(1)

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(username, pagenum)
            url = page_base + "&page=" + str(pagenum)
            request = compat_urllib_request.Request(url)
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers; compare the unescaped form, which
            # is the form that gets stored.
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                video_id = unescapeHTML(mobj.group(1))
                if video_id not in ids_in_page:
                    ids_in_page.append(video_id)

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
        url_entries = [self.url_result(url, 'BlipTV') for url in urls]
        return [self.playlist_result(url_entries, playlist_title = username)]
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    @staticmethod
    def _to_text(value):
        """Return *value* as text, decoding UTF-8 byte strings."""
        if isinstance(value, bytes):
            return value.decode('utf-8')
        return value

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

    def _real_extract(self, url):
        """Extract the direct download URL and title of a depositfiles file.

        Returns a one-element list with the information dictionary, or
        None after reporting an error.
        """
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        # POST data must be bytes on Python 3; the urlencoded form is ASCII.
        post_data = compat_urllib_parse.urlencode(free_download_indication).encode('ascii')
        request = compat_urllib_request.Request(url, post_data)
        try:
            self.report_download_webpage(file_id)
            # Decode once so the text patterns below work on Python 2 and 3.
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve file webpage: %s' % compat_str(err))
            return

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                restriction_message = re.sub(r'\s+', ' ', mobj.group(1)).strip()
                self._downloader.report_error(u'%s' % restriction_message)
            else:
                self._downloader.report_error(u'unable to extract download URL from: %s' % url)
            return

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return

        file_title = mobj.group(1)

        return [{
            'id': self._to_text(file_id),
            'url': self._to_text(file_url),
            'uploader': None,
            'upload_date': None,
            'title': self._to_text(file_title),
            'ext': self._to_text(file_extension),
        }]
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[%s] Logging in' % self.IE_NAME)

    def _real_initialize(self):
        """Log in with the configured credentials, if any.

        Credentials come from the downloader parameters or, when
        'usenetrc' is set, from the .netrc entry for _NETRC_MACHINE.
        Failures are reported as warnings only.
        """
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        if useremail is None:
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        # POST data must be bytes on Python 3; the urlencoded form is ASCII.
        post_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, post_data)
        try:
            self.report_login()
            # Decode so that the text pattern below can be applied.
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
            # Getting the login form back means the login was rejected.
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username/password, or exceeded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Extract the video described by the player parameters in the page.

        Returns a one-element list with the information dictionary.
        Raises ExtractorError when the page cannot be parsed.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The player variables are a JSON array located between these two
        # JavaScript snippets.
        BEFORE = '{swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        if not m:
            raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        video_data = params['video_data'][0]
        # Prefer the HD source and fall back to SD; .get() lets a missing
        # key reach the explicit error below instead of raising KeyError.
        video_url = video_data.get('hd_src')
        if not video_url:
            video_url = video_data.get('sd_src')
        if not video_url:
            raise ExtractorError(u'Cannot find video URL')
        video_duration = int(video_data['video_duration'])
        thumbnail = video_data['thumbnail_src']

        m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
        if not m:
            raise ExtractorError(u'Cannot find title in webpage')
        video_title = unescapeHTML(m.group(1))

        info = {
            'id': video_id,
            'title': video_title,
            'url': video_url,
            'ext': 'mp4',
            'duration': video_duration,
            'thumbnail': thumbnail,
        }
        return [info]
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report that the URL points directly at a video file."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        """Extract a blip.tv video from its JSON description.

        '/play/' URLs are resolved through their redirect first. A URL
        answering with a video Content-Type is returned as a direct
        download, reusing the open handle. Returns a one-element list
        with the information dictionary, or None after reporting an error.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        urlp = compat_urllib_parse_urlparse(url)
        if urlp.path.startswith('/play/'):
            request = compat_urllib_request.Request(url)
            response = compat_urllib_request.urlopen(request)
            redirecturl = response.geturl()
            rurlp = compat_urllib_parse_urlparse(redirecturl)
            file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
            url = 'http://blip.tv/a/a-' + file_id
            return self._real_extract(url)

        cchar = '&' if '?' in url else '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                title, ext = os.path.splitext(basename)
                # Only byte strings need decoding; text has no decode()
                # on Python 3.
                if isinstance(title, bytes):
                    title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    'urlhandle': urlh
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
            try:
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to read video info webpage: %s' % compat_str(err))
                return

            try:
                json_data = json.loads(json_code)
                # The description may be wrapped in a 'Post' object.
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl'],
                    'user_agent': 'iTunes/10.6.1',
                }
            except (ValueError, KeyError) as err:
                self._downloader.report_error(u'unable to parse video information: %s' % repr(err))
                return

        return [info]
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract a myvideo.de video.

        The media URL is derived from the thumbnail location advertised
        in the page. Returns a one-element list with the information
        dictionary, or None after reporting an error.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # Was self._download, which does not exist on the extractor.
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\'',
                 webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': u'flv',
        }]
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # Accepted URLs (see _VALID_URL, which is matched with re.VERBOSE):
    # abbreviations like :thedailyshow or :colbert
    # full-episode URLs (<show>.com/full-episodes/<episode>)
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    #                     or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    #                     or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                      |(https?://)?(www\.)?
                          (?P<showname>thedailyshow|colbertnation)\.com/
                         (full-episodes/(?P<episode>.*)|
                          (?P<clip>
                              (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                              |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                     $"""

    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    # Both tables are keyed by the bitrate strings listed above.
    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL has to be matched in verbose mode.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_extraction(self, episode_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

    def report_config_download(self, episode_id, media_id):
        """Report download of the configuration of one media item."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration for %s' % (episode_id, media_id))

    def report_index_download(self, episode_id):
        """Report download of the show index."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

    def _print_formats(self, formats):
        """Print one line per format id with its extension and dimensions."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))


    def _real_extract(self, url):
        """Extract all parts of an episode or clip.

        Returns a list with one information dictionary per item of the
        show index. Returns None after reporting an error, or after
        printing the formats when 'listformats' is set. Raises
        ExtractorError when an RTMP URL cannot be transformed.
        """
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # An abbreviation is expanded to the full-episodes page of the show.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
            dlNewest = False
        else:
            # No episode in the URL: the page the site redirects to is used.
            dlNewest = not mobj.group('episode')
            if dlNewest:
                epTitle = mobj.group('showname')
            else:
                epTitle = mobj.group('episode')

        req = compat_urllib_request.Request(url)
        self.report_extraction(epTitle)
        try:
            htmlHandle = compat_urllib_request.urlopen(req)
            html = htmlHandle.read()
            webpage = html.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
            return
        if dlNewest:
            # Take the episode name from the URL we were redirected to.
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
                self._downloader.report_error(u'Invalid redirected URL: ' + url)
                return
            if mobj.group('episode') == '':
                self._downloader.report_error(u'Redirected URL is still not specific: ' + url)
                return
            epTitle = mobj.group('episode')

        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the reference in a data-mgid
            # attribute without a URL prefix; so extract the alternate
            # reference and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                self._downloader.report_error(u'unable to find Flash URL in webpage ' + url)
                return
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        # Second element of the pair: the reference without the URL prefix.
        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        try:
            indexXml = compat_urllib_request.urlopen(indexUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download episode index: ' + compat_str(err))
            return

        results = []

        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for partNum,itemEl in enumerate(itemEls):
            # The guid is colon separated; its last field is used as the
            # short id and the one before it as the show id.
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configReq = compat_urllib_request.Request(configUrl)
            self.report_config_download(epTitle, shortMediaId)
            try:
                configXml = compat_urllib_request.urlopen(configReq).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            cdoc = xml.etree.ElementTree.fromstring(configXml)
            # (bitrate, source URL) pairs in document order.
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the highest bitrate
            # NOTE(review): assumes the last rendition is the highest - confirm.
            format,rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f,v in turls:
                if f == req_format:
                    format, rtmp_video_url = f, v
                    break

            # Rebuild an HTTP URL from the path part of the RTMP URL.
            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            if not m:
                raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
            }
            results.append(info)

        return results
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_extraction(self, showName):
        """Report information extraction."""
        self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)

    def report_config_download(self, showName):
        """Report download of the player configuration."""
        self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)

    def _real_extract(self, url):
        """Extract an Escapist video through its player configuration.

        Returns a one-element list with the information dictionary, or
        None after reporting an error. Description and thumbnail are
        optional and are None when the page does not provide them.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        try:
            pageHandle = compat_urllib_request.urlopen(url)
            webPageBytes = pageHandle.read()
            # Use the charset announced by the server, utf-8 otherwise.
            m = re.match(r'text/html; charset="?([^"]+)"?', pageHandle.headers['Content-Type'])
            webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: ' + compat_str(err))
            return

        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
        description = unescapeHTML(descMatch.group(1)) if descMatch else None
        imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
        imgUrl = unescapeHTML(imgMatch.group(1)) if imgMatch else None

        # The player URL carries the configuration URL; without it there
        # is nothing to download.
        playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
        if playerUrlMatch is None:
            self._downloader.report_error(u'unable to extract player URL')
            return
        playerUrl = unescapeHTML(playerUrlMatch.group(1))
        configUrlMatch = re.search('config=(.*)$', playerUrl)
        if configUrlMatch is None:
            self._downloader.report_error(u'unable to extract configuration URL')
            return
        configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))

        self.report_config_download(showName)
        try:
            configHandle = compat_urllib_request.urlopen(configUrl)
            m = re.match(r'text/html; charset="?([^"]+)"?', configHandle.headers['Content-Type'])
            configJSON = configHandle.read().decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download configuration: ' + compat_str(err))
            return

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

        try:
            config = json.loads(configJSON)
        except (ValueError,) as err:
            self._downloader.report_error(u'Invalid JSON in configuration file: ' + compat_str(err))
            return

        # The video URL is read from the second playlist entry.
        try:
            videoUrl = config['playlist'][1]['url']
        except (KeyError, IndexError, TypeError):
            self._downloader.report_error(u'unable to find video URL in configuration')
            return

        info = {
            'id': videoId,
            'url': videoUrl,
            'uploader': showName,
            'upload_date': None,
            'title': showName,
            'ext': 'flv',
            'thumbnail': imgUrl,
            'description': description,
            'player_url': playerUrl,
        }

        return [info]
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    _WORKING = False
    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report download of the XML manifest."""
        self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract a collegehumor video from its metadata XML and manifest.

        Returns a one-element list with the information dictionary, or
        None after reporting an error.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group('videoid')

        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        try:
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
        except IndexError:
            self._downloader.report_error(u'Invalid metadata XML file')
            return

        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
            return

        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        try:
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except (IndexError, KeyError):
            # KeyError: the media node exists but has no 'url' attribute.
            self._downloader.report_error(u'Invalid manifest file')
            return

        # The fragment URL is built from the manifest host, the manifest
        # id without its last two characters, and the media node URL.
        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'

        info['url'] = url
        info['ext'] = 'f4f'
        return [info]
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def report_extraction(self, video_id):
        """Announce that information is being extracted for *video_id*."""
        message = u'[%s] %s: Extracting information' % (self.IE_NAME, video_id)
        self._downloader.to_screen(message)

    def _real_extract(self, url):
        """Extract the flv URL, title and thumbnail from the video page.

        Returns a one-element list with the information dictionary, or
        None after reporting an error.
        """
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = url_match.group(1)

        webpage = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        # Video URL: percent-encoded value of the flv_url parameter.
        flv_match = re.search(r'flv_url=(.+?)&', webpage)
        if flv_match is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(flv_match.group(1))

        # Title: the page title up to the " - XVID" suffix.
        title_match = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        if title_match is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = title_match.group(1)

        # Thumbnail: the whole matched URL is used.
        thumb_match = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        if thumb_match is None:
            self._downloader.report_error(u'unable to extract video thumbnail')
            return
        video_thumbnail = thumb_match.group(0)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
     """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Report resolution of the track id."""
        self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report retrieval of the stream definitions."""
        self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Resolve a track URL and return its 128 kbit/s MP3 stream.

        Returns a one-element list with the information dictionary, or
        None after reporting an error.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract the slug of the song title
        slug_title = mobj.group(2)

        self.report_resolve('%s/%s' % (uploader, slug_title))

        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(resolv_url)
        try:
            info_json_bytes = compat_urllib_request.urlopen(request).read()
            info_json = info_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
            return

        info = json.loads(info_json)
        # The API can answer with an error list instead of a track; report
        # it the same way the set extractor does rather than failing on
        # info['id'].
        if 'errors' in info:
            for err in info['errors']:
                self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
            return

        video_id = info['id']
        self.report_extraction('%s/%s' % (uploader, slug_title))

        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(streams_url)
        try:
            stream_json_bytes = compat_urllib_request.urlopen(request).read()
            stream_json = stream_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download stream definitions: %s' % compat_str(err))
            return

        streams = json.loads(stream_json)
        mediaURL = streams['http_mp3_128_url']

        return [{
            'id': info['id'],
            'url': mediaURL,
            'uploader': info['user']['username'],
            'upload_date': info['created_at'],
            'title': info['title'],
            'ext': u'mp3',
            'description': info['description'],
        }]
class SoundcloudSetIE(InfoExtractor):
    """Information extractor for soundcloud.com sets
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
     """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Report resolution of the set id."""
        self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report retrieval of the stream definitions."""
        self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Resolve a set URL and return one entry per track of the set.

        Returns a list of information dictionaries, or None after
        reporting an error.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract the slug of the set title
        slug_title = mobj.group(2)

        self.report_resolve('%s/sets/%s' % (uploader, slug_title))

        url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(resolv_url)
        try:
            info_json_bytes = compat_urllib_request.urlopen(request).read()
            info_json = info_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
            return

        videos = []
        info = json.loads(info_json)
        if 'errors' in info:
            for err in info['errors']:
                self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
            return

        for track in info['tracks']:
            video_id = track['id']
            self.report_extraction('%s/sets/%s' % (uploader, slug_title))

            streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
            request = compat_urllib_request.Request(streams_url)
            try:
                stream_json_bytes = compat_urllib_request.urlopen(request).read()
                stream_json = stream_json_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download stream definitions: %s' % compat_str(err))
                return

            streams = json.loads(stream_json)
            mediaURL = streams['http_mp3_128_url']

            videos.append({
                'id': video_id,
                'url': mediaURL,
                'uploader': track['user']['username'],
                'upload_date': track['created_at'],
                'title': track['title'],
                'ext': u'mp3',
                'description': track['description'],
            })
        return videos
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract the RTMPE stream of an InfoQ presentation.

        Returns a one-element list with the information dictionary, or
        None after reporting an error.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # Extract video URL: the stream path is stored base64-encoded and
        # percent-encoded in the jsclassref attribute.
        mobj = re.search(r"jsclassref='([^']*)'", webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        # Extract title
        mobj = re.search(r'contentTitle = "(.*?)";', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = mobj.group(1)

        # Extract description
        video_description = u'No description available.'
        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if mobj is not None:
            video_description = mobj.group(1)

        video_filename = video_url.split('/')[-1]
        # Split on the last dot only, so that names containing further
        # dots do not break the unpacking.
        video_id, extension = os.path.splitext(video_filename)
        extension = extension[1:]

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }

        return [info]
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    @staticmethod
    def _to_text(value):
        """Return value as text: byte strings are decoded as UTF-8, text is returned unchanged."""
        if isinstance(value, bytes):
            return value.decode('utf-8')
        return value

    def report_download_json(self, file_id):
        """Report JSON download."""
        self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json

        jsonData[fmt] is either a mapping of bitrate -> url list or a plain
        url list.  When the requested bitrate is missing (or 'best'/None) the
        highest key is used.
        """
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list, or None when none can be opened"""
        for url in url_list:
            try:
                # Only reachability matters; close the probe handle right away.
                compat_urllib_request.urlopen(url).close()
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error):
                continue

        return None

    def _print_formats(self, formats):
        """Print a table of the available formats, bitrates and file extensions."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        """Extract the download information for a Mixcloud cloudcast.

        Returns a one-element list holding the information dictionary, or
        None after an error was reported or the format list was printed.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        # extract uploader & filename from url
        # (_to_text: the groups are already text when url is text)
        uploader = self._to_text(mobj.group(1))
        file_id = uploader + u'-' + self._to_text(mobj.group(2))

        # construct API request
        api_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(api_url)
        try:
            self.report_download_json(api_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve file: %s' % compat_str(err))
            return

        # parse JSON (decoded first so that bytes responses work everywhere)
        json_data = json.loads(self._to_text(jsonData))
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        file_url = None
        format_param = None
        if req_format is None or req_format == 'best':
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats:
                self._downloader.report_error(u'format is not available')
                return

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        # Previously a None file_url crashed on .decode()/.split() below.
        if file_url is None:
            self._downloader.report_error(u'unable to find a working download URL')
            return
        file_url = self._to_text(file_url)

        return [{
            'id': file_id,
            'url': file_url,
            'uploader': uploader,
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1],
            'format': u'NA' if format_param is None else self._to_text(format_param),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': self._to_text(player_url),
        }]

class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""

    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def report_download_webpage(self, objid):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _extract_references(self, pages):
        """Run extraction on every relative page link and concatenate the results."""
        results = []
        for page in pages:
            page_url = 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(page)
            # A sub-extraction that yields nothing (None) is skipped instead
            # of aborting the whole listing with a TypeError.
            results += self.extract(page_url) or []
        return results

    def _real_extract(self, url):
        """Extract a single video, all videos of a course, or all courses.

        The URL decides the mode: course+video -> one video, course only ->
        every VideoPage linked from the course page, otherwise every
        CoursePage linked from the site's home page.

        Returns a list of information dictionaries, or None after an error
        has been reported.  Raises ExtractorError for an invalid URL.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
                return
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                self._downloader.report_error(u'Invalid metadata XML file')
                return
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            coursepage = self._download_webpage(url, course,
                                        note='Downloading course info page',
                                        errnote='Unable to download course info page')

            links = orderedSet(re.findall(r'<a href="(VideoPage.php\?[^"]+)">', coursepage))
            return self._extract_references(links)
        else: # Root page
            self.report_download_webpage('Stanford OpenClassroom')
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                rootpage = compat_urllib_request.urlopen(rootURL).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download course info page: ' + compat_str(err))
                return
            # read() yields bytes; decode so the text regex below can be
            # applied (matching bytes with a str pattern raises TypeError).
            if isinstance(rootpage, bytes):
                rootpage = rootpage.decode('utf-8', 'replace')

            links = orderedSet(re.findall(r'<a href="(CoursePage.php\?[^"]+)">', rootpage))
            return self._extract_references(links)

class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    @staticmethod
    def _as_text(value):
        """Decode value as ISO-8859-1 only when it is a byte string."""
        if isinstance(value, bytes):
            return value.decode('iso-8859-1')
        return value

    def _real_extract(self, url):
        """Extract the highest-quality rendition of an MTV.com video.

        Returns a one-element list holding the information dictionary, or
        None after an error has been reported to the downloader.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract song name')
            return
        # Calling .decode() unconditionally fails on text (always on
        # Python 3), so only byte strings are decoded.
        song_name = unescapeHTML(self._as_text(mobj.group(1)))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract performer')
            return
        performer = unescapeHTML(self._as_text(mobj.group(1)))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to mtvn_uri')
            return
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract content id')
            return
        content_id = mobj.group(1)

        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video metadata: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')
        if not renditions:
            # renditions[-1] would raise IndexError on an empty list.
            self._downloader.report_error(u'unable to extract video renditions')
            return

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except (KeyError, AttributeError):
            # AttributeError: the rendition has no <src> child (find() -> None).
            self._downloader.trouble('Invalid rendition field.')
            return

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]

class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com (videos are served in segments)."""
    _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, file_id))

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def _gen_sid(self):
        """Build a session id: millisecond timestamp followed by two random numbers."""
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        """Return the character table shuffled deterministically from seed (a list of characters)."""
        mixed = []
        # "\\:" keeps the literal backslash of the original table without
        # relying on an invalid escape sequence.
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\\:._-1234567890")
        seed = float(seed)
        for _ in range(len(source)):
            seed = (seed * 211 + 30031) % 65536
            index = int(math.floor(seed / 65536 * len(source)))
            # Characters are unique, so popping by index equals the former
            # append-then-remove-by-value pair.
            mixed.append(source.pop(index))
        return mixed

    def _get_file_id(self, fileId, seed):
        """Translate the '*'-separated index list fileId through the shuffled table."""
        mixed = self._get_file_ID_mix_string(seed)
        return ''.join(mixed[int(ch)] for ch in fileId.split('*') if ch)

    def _real_extract(self, url):
        """Extract one information dictionary per video segment.

        Returns a list of information dictionaries, or None after an error
        has been reported to the downloader.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        request = compat_urllib_request.Request(info_url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            jsondata = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)
        try:
            jsonstr = jsondata.decode('utf-8')
            config = json.loads(jsonstr)

            video_title = config['data'][0]['title']
            seed = config['data'][0]['seed']

            format = self._downloader.params.get('format', None)
            supported_format = list(config['data'][0]['streamfileids'].keys())

            if format is None or format == 'best':
                if 'hd2' in supported_format:
                    format = 'hd2'
                else:
                    format = 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'

            fileid = config['data'][0]['streamfileids'][format]
            keys = [s['k'] for s in config['data'][0]['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError, IndexError, TypeError):
            # IndexError/TypeError: 'data' is empty or not shaped as expected.
            self._downloader.report_error(u'unable to extract info section')
            return

        files_info=[]
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        #characters 9 and 10 of fileid (fileid[8:10]) hold the segment number
        #and are replaced by the two-digit hex index of each segment
        for index, key in enumerate(keys):

            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            info = {
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            }
            files_info.append(info)

        return files_info

class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'

    def report_webpage(self, video_id):
        """Report webpage download"""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction"""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Return a one-element list with the video information, or None after a reported error."""
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = url_match.group(1)

        self.report_webpage(video_id)

        # Fetch and decode the page
        try:
            page = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % err)
            return

        # All three fields are mandatory and are looked up in this order;
        # the first missing one aborts the extraction.
        lookups = (
            (self.VIDEO_URL_RE, u'unable to extract video url'),
            (self.VIDEO_TITLE_RE, u'unable to extract video title'),
            (self.VIDEO_THUMB_RE, u'unable to extract video thumbnail'),
        )
        found = []
        for pattern, failure in lookups:
            hit = re.search(pattern, page)
            if hit is None:
                self._downloader.report_error(failure)
                return
            found.append(hit.group(1))
        quoted_url, video_title, video_thumbnail = found

        return [{
            'id': video_id,
            'url': compat_urllib_parse.unquote(quoted_url),
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]

class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extract_entry(self, url):
        """Report downloading entry"""
        self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)

    def report_date(self, upload_date):
        """Report entry date"""
        self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)

    def report_uploader(self, uploader):
        """Report uploader"""
        self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)

    def report_title(self, video_title):
        """Report title"""
        self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)

    def report_extract_vid_page(self, video_page):
        """Report video page extraction."""
        self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)

    def _real_extract(self, url):
        """Extract the highest-resolution video attached to a Google+ post.

        Returns a one-element list holding the information dictionary, or
        None after an error has been reported to the downloader.
        """
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        post_url = mobj.group(0)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        self.report_extract_entry(post_url)
        request = compat_urllib_request.Request(post_url)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve entry webpage: %s' % compat_str(err))
            return

        # Extract update date
        upload_date = None
        pattern = 'title="Timestamp">(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            try:
                # Convert timestring to a format suitable for filename
                upload_date = datetime.datetime.strptime(mobj.group(1), "%Y-%m-%d").strftime('%Y%m%d')
            except ValueError:
                # Unexpected timestamp layout: the date is optional, go on.
                upload_date = None
        self.report_date(upload_date)

        # Extract uploader
        uploader = None
        pattern = r'rel\="author".*?>(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            uploader = mobj.group(1)
        self.report_uploader(uploader)

        # Extract title
        # Get the first line for title
        video_title = u'NA'
        pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
        mobj = re.search(pattern, webpage)
        if mobj:
            video_title = mobj.group(1)
        self.report_title(video_title)

        # Step 2, Simulate clicking the image box to launch video
        pattern = r'"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
        mobj = re.search(pattern, webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video page URL')
            # Without this return the next line dereferenced None.
            return

        video_page = mobj.group(1)
        request = compat_urllib_request.Request(video_page)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return
        self.report_extract_vid_page(video_page)

        # Extract video links of all sizes on video page; each hit is a
        # (resolution, url) tuple of strings.
        pattern = r'\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        found_links = re.findall(pattern, webpage)
        if not found_links:
            self._downloader.report_error(u'unable to extract video links')
            # Without this return, indexing the empty list raised IndexError.
            return

        # Sort by NUMERIC resolution; as strings "1080" sorts before "720".
        links = sorted(found_links, key=lambda link: int(link[0]))

        # The last entry has the highest resolution; keep only its url.
        video_url = links[-1][1]
        # Treat escaped \u0026 style hex
        try:
            video_url = video_url.decode("unicode_escape")
        except AttributeError: # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': uploader,
            'upload_date': upload_date,
            'title': video_title,
            'ext': video_extension,
        }]

class NBAIE(InfoExtractor):
    """Information extractor for video pages on nba.com."""
    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
    IE_NAME = u'nba'

    def _real_extract(self, url):
        """Return a one-element list with the video information, or None for an invalid URL."""
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # The id is the URL path below /video, without a trailing /index.html
        video_id = url_match.group(1)
        index_suffix = '/index.html'
        if video_id.endswith(index_suffix):
            video_id = video_id[:-len(index_suffix)]

        webpage = self._download_webpage(url, video_id)

        def _page_property(rexp, default=None):
            """First group of rexp in the page (HTML-unescaped), or default when absent."""
            found = re.search(rexp, webpage)
            if not found:
                return default
            return unescapeHTML(found.group(1))

        shortened_video_id = video_id.rpartition('/')[2]
        raw_title = _page_property(r'<meta property="og:title" content="(.*?)"', shortened_video_id)

        # NOTE(review): 'uploader_date' is not one of the fields listed in the
        # InfoExtractor docstring ('upload_date' is) - looks like a typo, but
        # the scraped value's format is unknown here; confirm before renaming.
        return [{
            'id': shortened_video_id,
            'url': u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4',
            'ext': 'mp4',
            'title': raw_title.replace('NBA.com: ', ''),
            'uploader_date': _page_property(r'<b>Date:</b> (.*?)</div>'),
            'description': _page_property(r'<div class="description">(.*?)</h1>'),
        }]

class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
                (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    def _parse_page(self, url):
        """Download one API page and convert its clips.

        Returns (count of items, list of *valid* items), where an item is
        valid when it has a non-empty 'video_file_url'.  Returns None after
        a download or API error has been reported.
        """
        try:
            urlh = compat_urllib_request.urlopen(url)
            webpage_bytes = urlh.read()
            webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info JSON: %s' % compat_str(err))
            return

        response = json.loads(webpage)
        if not isinstance(response, list):
            # Errors come back as an object instead of a list of clips.
            error_text = response.get('error', 'unknown error')
            self._downloader.report_error(u'Justin.tv API: %s' % error_text)
            return
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                video_date = re.sub('-', '', clip['start_time'][:10])
                video_uploader_id = clip.get('user_id', clip.get('channel_id'))
                video_id = clip['id']
                video_title = clip.get('title', video_id)
                info.append({
                    'id': video_id,
                    'url': video_url,
                    'title': video_title,
                    'uploader': clip.get('channel_name', video_uploader_id),
                    'uploader_id': video_uploader_id,
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        """Extract a single archived broadcast, or page through a whole channel archive.

        Returns the list of information dictionaries collected so far (it
        may be empty when a page fails), or None for an invalid URL.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        api = 'http://api.justin.tv'
        video_id = mobj.group(mobj.lastindex)
        # Only the channel name matched -> list the channel archive page by page.
        paged = mobj.lastindex == 1
        if paged:
            api += '/channel/archives/%s.json'
        else:
            api += '/broadcast/by_archive/%s.json'
        api = api % (video_id,)

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page = self._parse_page(page_url)
            if page is None:
                # _parse_page already reported the error; unpacking None
                # would raise TypeError, so stop with what we have.
                break
            page_count, page_info = page
            info.extend(page_info)
            if not paged or page_count != limit:
                break
            offset += limit
        return info

class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        """Extract the video URL, title and description from a video page.

        Returns a one-element list holding the information dictionary, or
        None after an error has been reported to the downloader.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
        if not m:
            self._downloader.report_error(u'unable to find video information')
            # Stop here: going on would call .group() on None.
            return
        video_url = unescapeHTML(m.group('url'))

        # Prefer the player heading, fall back to the page <title>.
        m = re.search(r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", webpage, flags=re.DOTALL)
        if not m:
            m = re.search(r'<title>(?P<title>[^<]+?)</title>', webpage)
            if not m:
                self._downloader.trouble(u'Cannot find video title')
                return
        title = clean_html(m.group('title'))

        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        if m:
            desc = unescapeHTML(m.group('desc'))
        else:
            desc = None

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
        }
        return [info]

class SteamIE(InfoExtractor):
    """Information extractor for the trailers of a game on store.steampowered.com"""
    _VALID_URL = r"""http://store.steampowered.com/
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written in verbose syntax, hence re.VERBOSE.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        """Return one information dictionary per movie found on the game's video page."""
        game_id = re.match(self._VALID_URL, url, re.VERBOSE).group('gameID')
        page = self._download_webpage('http://store.steampowered.com/video/%s/' % game_id, game_id)

        # Movies, titles and thumbnails are matched independently and paired
        # up by their order of appearance in the page.
        movie_hits = re.finditer(r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},", page)
        title_hits = re.finditer(r'<span class="title">(?P<videoName>.+?)</span>', page)
        thumb_hits = re.finditer(r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">', page)

        entries = []
        for movie, heading, image in zip(movie_hits, title_hits, thumb_hits):
            video_id = movie.group('videoID')
            video_url = movie.group('videoURL')
            if not video_url:
                self._downloader.report_error(u'Cannot find video url for %s' % video_id)
            entries.append({
                'id': video_id,
                'url': video_url,
                'ext': 'flv',
                'title': unescapeHTML(heading.group('videoName')),
                'thumbnail': image.group('thumbnail'),
            })
        return entries

class UstreamIE(InfoExtractor):
    """Information extractor for recorded videos on ustream.tv"""
    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        """Extract the video URL, title and uploader id of a recorded video.

        Returns a one-element list holding the information dictionary.
        Raises ExtractorError when the URL is invalid or the page carries
        no title; a missing uploader is reported as None.
        """
        m = re.match(self._VALID_URL, url)
        if m is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = m.group('videoID')
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)

        m = re.search(r'data-title="(?P<title>.+)"',webpage)
        if m is None:
            # Previously an AttributeError on None.
            raise ExtractorError(u'Unable to extract video title')
        title = m.group('title')

        m = re.search(r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',webpage)
        # The uploader is optional metadata; do not fail without it.
        uploader = m.group('uploader') if m is not None else None

        info = {
                'id':video_id,
                'url':video_url,
                'ext': 'flv',
                'title': title,
                'uploader': uploader
                  }
        return [info]

class WorldStarHipHopIE(InfoExtractor):
    """Information extractor for worldstarhiphop.com and worldstarcandy.com"""
    _VALID_URL = r'http://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
    IE_NAME = u'WorldStarHipHop'

    def _real_extract(self, url):
        """Extract the video URL, title and thumbnail from a video page.

        Returns a one-element list holding the information dictionary, or
        None after an error has been reported to the downloader.
        """
        _src_url = r"""(http://hw-videos.*(?:mp4|flv))"""

        m = re.match(self._VALID_URL, url)
        if m is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = m.group('id')

        # Use the shared download helper (as the other extractors do)
        # instead of a bare urlopen() without any error handling.
        webpage_src = self._download_webpage(url, video_id)

        mobj = re.search(_src_url, webpage_src)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Cannot find video url for %s' % video_id)
            return
        video_url = mobj.group()
        # The pattern guarantees the match ENDS with mp4 or flv; testing the
        # suffix avoids being fooled by "mp4" elsewhere in the URL.
        if video_url.endswith('mp4'):
            ext = 'mp4'
        else:
            ext = 'flv'

        _title = r"""<title>(.*)</title>"""

        mobj = re.search(_title, webpage_src)

        if mobj is not None:
            title = mobj.group(1)
        else:
            title = 'World Star Hip Hop - %s' % time.ctime()

        _thumbnail = r"""rel="image_src" href="(.*)" />"""
        mobj = re.search(_thumbnail, webpage_src)

        # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
        if mobj is not None:
            thumbnail = mobj.group(1)
        else:
            _title = r"""candytitles.*>(.*)</span>"""
            mobj = re.search(_title, webpage_src)
            if mobj is not None:
                title = mobj.group(1)
            thumbnail = None

        results = [{
                    'id': video_id,
                    'url' : video_url,
                    'title' : title,
                    'thumbnail' : thumbnail,
                    'ext' : ext,
                    }]
        return results

class RBMARadioIE(InfoExtractor):
    """Information extractor for shows on rbmaradio.com"""
    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        """Return a one-element list built from the show metadata embedded in the page.

        Raises ExtractorError when the metadata is missing or is not valid JSON.
        """
        video_id = re.match(self._VALID_URL, url).group('videoID')

        webpage = self._download_webpage(url, video_id)
        metadata = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
        if not metadata:
            raise ExtractorError(u'Cannot find metadata')

        try:
            data = json.loads(metadata.group(1))
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        # Request the 256 kbit/s stream; the extension comes from the URL path
        # so the query string cannot interfere.
        video_url = data['akamai_url'] + '&cbr=256'
        video_ext = compat_urllib_parse_urlparse(video_url).path.rpartition('.')[2]

        return [{
                'id': video_id,
                'url': video_url,
                'ext': video_ext,
                'title': data['title'],
                'description': data.get('teaser_text'),
                'location': data.get('country_of_origin'),
                'uploader': data.get('host', {}).get('name'),
                'uploader_id': data.get('host', {}).get('slug'),
                'thumbnail': data.get('image', {}).get('large_url_2x'),
                'duration': data.get('duration'),
        }]

class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s' % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        """Return the first entry of formats whose 'format' equals req_format, or None."""
        for x in formats:
            if(x["format"]==req_format):
                return x
        return None

    def _real_extract(self, url):
        """Extract the download links of a video and select the requested format.

        Returns a list of information dictionaries (one entry, or all of
        them for '-1'/'all'), or None after an error was reported or the
        format list was printed.  Raises ExtractorError when the title or
        the download list cannot be found.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')

        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get the video title
        result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract video title')
        video_title = result.group('title').strip()

        # Get the video date
        result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract video date')
            upload_date = None
        else:
            upload_date = result.group('date').strip()

        # Get the video uploader
        result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract uploader')
            video_uploader = None
        else:
            video_uploader = result.group('uploader').strip()
            video_uploader = clean_html( video_uploader )

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        result = re.search(DOWNLOAD_LIST_RE, webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract download list')
        download_list_html = result.group('download_list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if(len(links) == 0):
            raise ExtractorError(u'ERROR: no known formats available for video')

        self._downloader.to_screen(u'[youporn] Links found: %d' % len(links))

        formats = []
        for link in links:

            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML( link )
            path = compat_urllib_parse_urlparse( video_url ).path
            extension = os.path.splitext( path )[1][1:]
            # The 5th path component is "<size>_<bitrate>_<id>".  Links that
            # do not follow this layout are skipped instead of raising
            # IndexError.
            path_parts = path.split('/')
            quality = path_parts[4].split('_')[:2] if len(path_parts) > 4 else []
            if len(quality) < 2:
                self._downloader.report_warning(u'skipping download link with unexpected path: %s' % path)
                continue
            size, bitrate = quality
            format_name = "-".join( quality )
            title = u'%s-%s-%s' % (video_title, size, bitrate)

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': title,
                'ext': extension,
                'format': format_name,
                'thumbnail': None,
                'description': None,
                'player_url': None
            })

        if not formats:
            raise ExtractorError(u'ERROR: no known formats available for video')

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self._downloader.to_screen(u'[youporn] Format: %s' % req_format)

        # The page lists formats best-first.
        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            selected = self._specific( req_format, formats )
            # Check the lookup result itself (not an earlier regex match).
            if selected is None:
                self._downloader.report_error(u'requested format not available')
                return
            return [selected]

class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        """Extract the FLV URL and upload date; id and title come from the page URL.

        Returns a one-element list holding the information dictionary, or
        None after an error has been reported to the downloader.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')
        video_title = mobj.group('title')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video URL
        VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
        result = re.search(VIDEO_URL_RE, webpage)
        if result is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(result.group('url'))

        #Get the uploaded date
        VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
        result = re.search(VIDEO_UPLOADED_RE, webpage)
        if result is None:
            # This used to blame the title, which is taken from the URL
            # and cannot fail here.
            self._downloader.report_error(u'unable to extract upload date')
            return
        upload_date = result.group('date')

        info = {'id': video_id,
                'url': video_url,
                'uploader': None,
                'upload_date': upload_date,
                'title': video_title,
                'ext': 'flv',
                'format': 'flv'}

        return [info]

class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    # The dot before "html" is escaped so that only a literal ".html" suffix
    # is accepted.
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+)\.html$'

    def _real_extract(self, url):
        """Extract the video behind a youjizz.com video page URL.

        The title is read from the video page; the page also links to an
        embed page, which carries the numeric video id and the media URL.

        Returns a one-element list holding the information dictionary, or
        None after reporting an error when the URL does not match.
        Raises ExtractorError when the title, the embed page link or the
        media URL cannot be found.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # Id taken from the URL; replaced below by the id found in the
        # embed page link.
        video_id = mobj.group('videoid')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video title
        result = re.search(r'<title>(?P<title>.*)</title>', webpage)
        if result is None:
            raise ExtractorError(u'ERROR: unable to extract video title')
        video_title = result.group('title').strip()

        # Get the embed page
        result = re.search(r'https?://www\.youjizz\.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        if result is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')

        embed_page_url = result.group(0).strip()
        video_id = result.group('videoid')

        webpage = self._download_webpage(embed_page_url, video_id)

        # Get the video URL
        result = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
        if result is None:
            raise ExtractorError(u'ERROR: unable to extract video url')
        video_url = result.group('source')

        info = {'id': video_id,
                'url': video_url,
                'title': video_title,
                'ext': 'flv',
                'format': 'flv',
                'player_url': embed_page_url}

        return [info]
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes."""
    IE_NAME = '8tracks'
    # The dot in the host name is escaped so it only matches a literal ".".
    _VALID_URL = r'https?://8tracks\.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        """Extract every track of the mix at the given URL.

        The mix description is read from the JSON assigned to ``PAGE.mix``
        in the page. Tracks are then requested one at a time from the
        "sets" endpoints under a randomly chosen session number, until the
        response flags the last track.

        Returns a list with one information dictionary per track.
        Raises ExtractorError when the URL does not match or the mix
        description cannot be found in the page.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        m = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
        if not m:
            raise ExtractorError(u'Cannot find trax information')
        json_like = m.group(1)
        data = json.loads(json_like)

        session = str(random.randint(0, 1000000000))
        mix_id = data['id']
        track_count = data['tracks_count']
        first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
        next_url = first_url
        res = []
        # The loop ends only when the API reports 'at_last_track'; the
        # counter is used for the progress note alone.
        for i in itertools.count():
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (i + 1, track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            info = {
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
                'ext': 'm4a',
            }
            res.append(info)
            if api_data['set']['at_last_track']:
                break
            # The follow-up request names the track that was just returned
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
        return res
class KeekIE(InfoExtractor):
    """Information extractor for keek.com."""
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        """Extract the video behind a keek.com URL.

        The media and thumbnail URLs are built from the video id alone; the
        title and the uploader are scraped from the downloaded page.

        Returns a one-element list holding the information dictionary.
        Raises ExtractorError when the URL does not match or the page has
        no title; a missing uploader is stored as None.
        """
        m = re.match(self._VALID_URL, url)
        if m is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = m.group('videoID')
        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
        webpage = self._download_webpage(url, video_id)

        m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
        if m is None:
            raise ExtractorError(u'Cannot find video title')
        title = unescapeHTML(m.group('title'))

        # The uploader is an optional field, so its absence is not an error
        m = re.search(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>', webpage)
        if m is None:
            uploader = None
        else:
            uploader = clean_html(m.group('uploader'))

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumbnail,
            'uploader': uploader
        }
        return [info]
class TEDIE(InfoExtractor):
    """Information extractor for ted.com talks and playlists."""
    # Verbose pattern: it must always be matched with re.VERBOSE.
    _VALID_URL=r'''http://www.ted.com/
                   (
                        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                        |
                        ((?P<type_talk>talks)) # We have a simple talk
                   )
                   /(?P<name>\w+) # Here goes the name and then ".html"
                   '''

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        """Dispatch to talk or playlist extraction depending on the URL.

        Returns a one-element list holding either the talk's information
        dictionary or the playlist result.
        Raises ExtractorError when the URL does not match _VALID_URL.
        """
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        if m is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        if m.group('type_talk'):
            return [self._talk_info(url)]
        playlist_id = m.group('playlist_id')
        name = m.group('name')
        self._downloader.to_screen(u'[%s] Getting info of playlist %s: "%s"' % (self.IE_NAME,playlist_id,name))
        return [self._playlist_videos_info(url,name,playlist_id)]

    def _talk_video_link(self,mediaSlug):
        '''Returns the video link for that mediaSlug'''
        return 'http://download.ted.com/talks/%s.mp4' % mediaSlug

    def _playlist_videos_info(self,url,name,playlist_id=0):
        '''Returns the videos of the playlist

        Each talk listed on the playlist page becomes a url_result entry
        that points at the talk's own page.
        Raises ExtractorError when the playlist title cannot be found.
        '''
        video_RE=r'''
                     <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
                     ([.\s]*?)data-playlist_item_id="(\d+)"
                     ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
                     '''
        video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
        m_names=re.finditer(video_name_RE,webpage)

        playlist_RE = r'div class="headline">(\s*?)<h1>(\s*?)<span>(?P<playlist_title>.*?)</span>'
        m_playlist = re.search(playlist_RE, webpage)
        if m_playlist is None:
            raise ExtractorError(u'Cannot find playlist title')
        playlist_title = m_playlist.group('playlist_title')

        playlist_entries = []
        # Only the name match is read, but zipping with the video matches
        # limits the entries to the shorter of the two sequences.
        for _, m_name in zip(m_videos,m_names):
            talk_url='http://www.ted.com%s' % m_name.group('talk_url')
            playlist_entries.append(self.url_result(talk_url, 'TED'))
        return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url

        The title, the numeric id and the media slug are scraped from the
        talk page; the download URL is derived from the media slug.
        Raises ExtractorError when the URL does not match or the title or
        the talk details cannot be found; a missing thumbnail is stored as
        None.
        """
        m=re.match(self._VALID_URL, url,re.VERBOSE)
        if m is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        videoName=m.group('name')
        webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
        # If the url includes the language we get the title translated
        title_RE=r'<span id="altHeadline" >(?P<title>.*)</span>'
        title_match=re.search(title_RE, webpage)
        if title_match is None:
            raise ExtractorError(u'Cannot find talk title')
        title=title_match.group('title')
        info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
                        "id":(?P<videoID>[\d]+).*?
                        "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
        thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
        thumb_match=re.search(thumb_RE,webpage)
        info_match=re.search(info_RE,webpage,re.VERBOSE)
        if info_match is None:
            raise ExtractorError(u'Cannot find talk details')
        video_id=info_match.group('videoID')
        mediaSlug=info_match.group('mediaSlug')
        video_url=self._talk_video_link(mediaSlug)
        # The thumbnail is optional, so a failed match is not fatal
        if thumb_match is None:
            thumbnail = None
        else:
            thumbnail = thumb_match.group('thumbnail')
        info = {
                'id': video_id,
                'url': video_url,
                'ext': 'mp4',
                'title': title,
                'thumbnail': thumbnail
                }
        return info
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de."""
    _VALID_URL = r'http://www\.myspass\.de/.*'

    def _real_extract(self, url):
        """Extract the video behind a myspass.de URL.

        The video id is taken from the URL path and used to download an
        XML metadata document, from which the media URL, title, format,
        description and thumbnail are read.

        Returns a one-element list holding the information dictionary, or
        None after reporting an error through the downloader when the
        metadata has no download URL or no title.
        """
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        # An element without text is as unusable as a missing one
        if url_flv_el is None or not url_flv_el.text:
            self._downloader.report_error(u'unable to extract download url')
            return
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]

        title_el = metadata.find('title')
        if title_el is None:
            self._downloader.report_error(u'unable to extract title')
            return
        title = title_el.text

        # 'format' defaults to the extension when the metadata names none
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            video_format = extension
        else:
            video_format = format_id_el.text

        description_el = metadata.find('description')
        if description_el is not None:
            description = description_el.text
        else:
            description = None

        imagePreview_el = metadata.find('imagePreview')
        if imagePreview_el is not None:
            thumbnail = imagePreview_el.text
        else:
            thumbnail = None

        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': video_format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
class SpiegelIE(InfoExtractor):
    """Information extractor for spiegel.de videos."""
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        """Extract the video behind a spiegel.de video URL.

        The title is scraped from the video page; the file name and the
        duration come from a per-video XML document.

        Returns a one-element list holding the information dictionary.
        Raises ExtractorError when the URL does not match, the title is
        missing, or the XML lacks the expected elements.
        """
        m = re.match(self._VALID_URL, url)
        if m is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = m.group('videoID')

        webpage = self._download_webpage(url, video_id)
        m = re.search(r'<div class="spVideoTitle">(.*?)</div>', webpage)
        if not m:
            raise ExtractorError(u'Cannot find title')
        video_title = unescapeHTML(m.group(1))

        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(xml_url, video_id,
                    note=u'Downloading XML', errnote=u'Failed to download XML')

        idoc = xml.etree.ElementTree.fromstring(xml_code)
        if len(idoc) == 0:
            raise ExtractorError(u'Cannot find video formats')
        # The last child of the document root is the one used.
        # NOTE(review): presumably the preferred variant - confirm.
        last_type = idoc[-1]
        filename_el = last_type.find('./filename')
        duration_el = last_type.find('./duration')
        if filename_el is None or duration_el is None:
            raise ExtractorError(u'Cannot find video file information')
        filename = filename_el.text
        duration = float(duration_el.text)

        video_url = 'http://video2.spiegel.de/flash/' + filename
        # Extension is whatever follows the last dot of the file name
        video_ext = filename.rpartition('.')[2]
        info = {
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': video_title,
            'duration': duration,
        }
        return [info]
class LiveLeakIE(InfoExtractor):
    """Information extractor for liveleak.com."""

    # "https?" accepts both schemes; the former "http?" made the final "p"
    # optional instead and rejected https URLs.
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        """Extract the video behind a liveleak.com view URL.

        The media URL, title, description and uploader are scraped from
        the downloaded page; description and uploader are optional.

        Returns a one-element list holding the information dictionary, or
        None after reporting an error through the downloader when the URL,
        the media URL or the title cannot be matched.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        m = re.search(r'file: "(.*?)",', webpage)
        if not m:
            self._downloader.report_error(u'unable to find video url')
            return
        video_url = m.group(1)

        m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
        if not m:
            self._downloader.trouble(u'Cannot find video title')
            # Stop here: there is no match object to read the title from
            return
        title = unescapeHTML(m.group('title')).replace('LiveLeak.com -', '').strip()

        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        if m:
            desc = unescapeHTML(m.group('desc'))
        else:
            desc = None

        m = re.search(r'By:.*?(\w+)</a>', webpage)
        if m:
            uploader = clean_html(m.group(1))
        else:
            uploader = None

        info = {
            'id':  video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
            'uploader': uploader
        }

        return [info]
class ARDIE(InfoExtractor):
    """Information extractor for ardmediathek.de and mediathek.daserste.de."""
    _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
    _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
    _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'

    def _real_extract(self, url):
        """Extract the best default-media-type stream of a mediathek page.

        Returns a one-element list holding the information dictionary, or
        None after reporting an error when the page lists no streams but
        contains an "fsk" marker.
        Raises ExtractorError when the video id, the title or a usable
        stream cannot be determined, or when a stream URL does not have
        the expected form. These conditions are raised explicitly rather
        than asserted, so they still apply when Python runs with -O.
        """
        # determine video id from url
        m = re.match(self._VALID_URL, url)

        # a numeric documentId in the URL takes precedence over the path
        numid = re.search(r'documentId=([0-9]+)', url)
        if numid:
            video_id = numid.group(1)
        elif m is not None:
            video_id = m.group('video_id')
        else:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # determine title and media streams from webpage
        html = self._download_webpage(url, video_id)
        title_match = re.search(self._TITLE, html)
        if title_match is None:
            raise ExtractorError(u'Cannot find video title')
        title = title_match.group('title')
        streams = [s.groupdict() for s in re.finditer(self._MEDIA_STREAM, html)]
        if not streams:
            if '"fsk"' not in html:
                raise ExtractorError(u'Cannot find any media streams')
            self._downloader.report_error(u'this video is only available after 8:00 pm')
            return

        # choose default media type and highest quality for now
        default_streams = [s for s in streams if int(s["media_type"]) == 0]
        if not default_streams:
            raise ExtractorError(u'Cannot find a stream of the default media type')
        stream = max(default_streams, key=lambda s: int(s["quality"]))

        # there's two possibilities: RTMP stream or HTTP download
        info = {'id': video_id, 'title': title, 'ext': 'mp4'}
        if stream['rtmp_url']:
            self._downloader.to_screen(u'[%s] RTMP download detected' % self.IE_NAME)
            if not stream['video_url'].startswith('mp4:'):
                raise ExtractorError(u'Unexpected RTMP play path: %s' % stream['video_url'])
            info["url"] = stream["rtmp_url"]
            info["play_path"] = stream['video_url']
        else:
            if not stream["video_url"].endswith('.mp4'):
                raise ExtractorError(u'Unexpected video URL: %s' % stream['video_url'])
            info["url"] = stream["video_url"]
        return [info]
4417 def gen_extractors():
4418 """ Return a list of an instance of every supported extractor.
4419 The order does matter; the first extractor matched is the one handling the URL.
4422 YoutubePlaylistIE(),
4447 StanfordOpenClassroomIE(),
4457 WorldStarHipHopIE(),
def get_info_extractor(ie_name):
    """Look up an info extractor class by its short name.

    The class is fetched from this module's global namespace under the key
    ``ie_name + 'IE'``; a KeyError is raised when no such global exists.
    """
    class_name = ie_name + 'IE'
    module_namespace = globals()
    return module_namespace[class_name]