2 # -*- coding: utf-8 -*-
4 from __future__ import absolute_import
15 import xml.etree.ElementTree
23 class InfoExtractor(object):
24 """Information Extractor class.
26 Information extractors are the classes that, given a URL, extract
27 information about the video (or videos) the URL refers to. This
28 information includes the real video URL, the video title, author and
29 others. The information is stored in a dictionary which is then
30 passed to the FileDownloader. The FileDownloader processes this
31 information possibly downloading the video to the file system, among
32 other possible outcomes.
34 The dictionaries must include the following fields:
38 title: Video title, unescaped.
39 ext: Video filename extension.
41 The following fields are optional:
43 format: The video format, defaults to ext (used for --get-format)
44 thumbnail: Full URL to a video thumbnail image.
45 description: One-line video description.
46 uploader: Full name of the video uploader.
47 upload_date: Video upload date (YYYYMMDD).
48 uploader_id: Nickname or id of the video uploader.
49 location: Physical location of the video.
50 player_url: SWF Player URL (used for rtmpdump).
51 subtitles: The subtitle file contents.
52 urlhandle: [internal] The urlHandle to be used to download the file,
53 like returned by urllib.request.urlopen
55 The fields should all be Unicode strings.
57 Subclasses of this one should re-define the _real_initialize() and
58 _real_extract() methods and define a _VALID_URL regexp.
59 Probably, they should also be added to the list of extractors.
61 _real_extract() must return a *list* of information dictionaries as
64 Finally, the _WORKING attribute should be set to False for broken IEs
65 in order to warn the users and skip the tests.
72 def __init__(self, downloader=None):
73 """Constructor. Receives an optional downloader."""
75 self.set_downloader(downloader)
78 def suitable(cls, url):
79 """Receives a URL and returns True if suitable for this IE."""
80 return re.match(cls._VALID_URL, url) is not None
84 """Getter method for _WORKING."""
88 """Initializes an instance (authentication, etc)."""
90 self._real_initialize()
93 def extract(self, url):
94 """Extracts URL information and returns it in list of dicts."""
96 return self._real_extract(url)
98 def set_downloader(self, downloader):
99 """Sets the downloader for this IE."""
100 self._downloader = downloader
102 def _real_initialize(self):
103 """Real initialization process. Redefine in subclasses."""
106 def _real_extract(self, url):
107 """Real extraction process. Redefine in subclasses."""
112 return type(self).__name__[:-2]
114 def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
115 """ Returns the response handle """
117 self.report_download_webpage(video_id)
118 elif note is not False:
119 self.to_screen(u'%s: %s' % (video_id, note))
121 return compat_urllib_request.urlopen(url_or_request)
122 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
124 errnote = u'Unable to download webpage'
125 raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])
127 def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
128 """ Returns the data of the page as a string """
129 urlh = self._request_webpage(url_or_request, video_id, note, errnote)
130 content_type = urlh.headers.get('Content-Type', '')
131 m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
133 encoding = m.group(1)
136 webpage_bytes = urlh.read()
137 if self._downloader.params.get('dump_intermediate_pages', False):
139 url = url_or_request.get_full_url()
140 except AttributeError:
142 self.to_screen(u'Dumping request to ' + url)
143 dump = base64.b64encode(webpage_bytes).decode('ascii')
144 self._downloader.to_screen(dump)
145 return webpage_bytes.decode(encoding, 'replace')
147 def to_screen(self, msg):
148 """Print msg to screen, prefixing it with '[ie_name]'"""
149 self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))
151 def report_extraction(self, id_or_name):
152 """Report information extraction."""
153 self.to_screen(u'%s: Extracting information' % id_or_name)
155 def report_download_webpage(self, video_id):
156 """Report webpage download."""
157 self.to_screen(u'%s: Downloading webpage' % video_id)
159 def report_age_confirmation(self):
160 """Report attempt to confirm age."""
161 self.to_screen(u'Confirming age')
163 #Methods for following #608
164 #They set the correct value of the '_type' key
165 def video_result(self, video_info):
166 """Returns a video"""
167 video_info['_type'] = 'video'
169 def url_result(self, url, ie=None):
170 """Returns a url that points to a page that should be processed"""
171 #TODO: ie should be the class used for getting the info
172 video_info = {'_type': 'url',
176 def playlist_result(self, entries, playlist_id=None, playlist_title=None):
177 """Returns a playlist"""
178 video_info = {'_type': 'playlist',
181 video_info['id'] = playlist_id
183 video_info['title'] = playlist_title
187 class YoutubeIE(InfoExtractor):
188 """Information extractor for youtube.com."""
192 (?:https?://)? # http(s):// (optional)
193 (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
194 tube\.majestyc\.net/) # the various hostnames, with wildcard subdomains
195 (?:.*?\#/)? # handle anchor (#/) redirect urls
196 (?: # the various things that can precede the ID:
197 (?:(?:v|embed|e)/) # v/ or embed/ or e/
198 |(?: # or the v= param in all its forms
199 (?:watch(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
200 (?:\?|\#!?) # the params delimiter ? or # or #!
201 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
204 )? # optional -> youtube.com/xxxx is OK
205 )? # all until now is optional -> you can pass the naked ID
206 ([0-9A-Za-z_-]+) # here is it! the YouTube video ID
207 (?(1).+)? # if we found the ID, everything can follow
209 _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
210 _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
211 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
212 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
213 _NETRC_MACHINE = 'youtube'
214 # Listed in order of quality
215 _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
216 _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
217 _video_extensions = {
223 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
229 _video_dimensions = {
248 def suitable(cls, url):
249 """Receives a URL and returns True if suitable for this IE."""
250 if YoutubePlaylistIE.suitable(url): return False
251 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
253 def report_lang(self):
254 """Report attempt to set language."""
255 self.to_screen(u'Setting language')
257 def report_login(self):
258 """Report attempt to log in."""
259 self.to_screen(u'Logging in')
261 def report_video_webpage_download(self, video_id):
262 """Report attempt to download video webpage."""
263 self.to_screen(u'%s: Downloading video webpage' % video_id)
265 def report_video_info_webpage_download(self, video_id):
266 """Report attempt to download video info webpage."""
267 self.to_screen(u'%s: Downloading video info webpage' % video_id)
269 def report_video_subtitles_download(self, video_id):
270 """Report attempt to download video info webpage."""
271 self.to_screen(u'%s: Checking available subtitles' % video_id)
273 def report_video_subtitles_request(self, video_id, sub_lang, format):
274 """Report attempt to download video info webpage."""
275 self.to_screen(u'%s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))
277 def report_video_subtitles_available(self, video_id, sub_lang_list):
278 """Report available subtitles."""
279 sub_lang = ",".join(list(sub_lang_list.keys()))
280 self.to_screen(u'%s: Available subtitles for video: %s' % (video_id, sub_lang))
282 def report_information_extraction(self, video_id):
283 """Report attempt to extract video information."""
284 self.to_screen(u'%s: Extracting video information' % video_id)
286 def report_unavailable_format(self, video_id, format):
287 """Report extracted video URL."""
288 self.to_screen(u'%s: Format %s not available' % (video_id, format))
290 def report_rtmp_download(self):
291 """Indicate the download will use the RTMP protocol."""
292 self.to_screen(u'RTMP download detected')
294 def _get_available_subtitles(self, video_id):
295 self.report_video_subtitles_download(video_id)
296 request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
298 sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
299 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
300 return (u'unable to download video subtitles: %s' % compat_str(err), None)
301 sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
302 sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
303 if not sub_lang_list:
304 return (u'video doesn\'t have subtitles', None)
307 def _list_available_subtitles(self, video_id):
308 sub_lang_list = self._get_available_subtitles(video_id)
309 self.report_video_subtitles_available(video_id, sub_lang_list)
311 def _request_subtitle(self, sub_lang, sub_name, video_id, format):
314 (error_message, sub_lang, sub)
316 self.report_video_subtitles_request(video_id, sub_lang, format)
317 params = compat_urllib_parse.urlencode({
323 url = 'http://www.youtube.com/api/timedtext?' + params
325 sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
326 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
327 return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
329 return (u'Did not fetch video subtitles', None, None)
330 return (None, sub_lang, sub)
332 def _extract_subtitle(self, video_id):
334 Return a list with a tuple:
335 [(error_message, sub_lang, sub)]
337 sub_lang_list = self._get_available_subtitles(video_id)
338 sub_format = self._downloader.params.get('subtitlesformat')
339 if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
340 return [(sub_lang_list[0], None, None)]
341 if self._downloader.params.get('subtitleslang', False):
342 sub_lang = self._downloader.params.get('subtitleslang')
343 elif 'en' in sub_lang_list:
346 sub_lang = list(sub_lang_list.keys())[0]
347 if not sub_lang in sub_lang_list:
348 return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]
350 subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
353 def _extract_all_subtitles(self, video_id):
354 sub_lang_list = self._get_available_subtitles(video_id)
355 sub_format = self._downloader.params.get('subtitlesformat')
356 if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
357 return [(sub_lang_list[0], None, None)]
359 for sub_lang in sub_lang_list:
360 subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
361 subtitles.append(subtitle)
364 def _print_formats(self, formats):
365 print('Available formats:')
367 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
369 def _real_initialize(self):
370 if self._downloader is None:
375 downloader_params = self._downloader.params
377 # Attempt to use provided username and password or .netrc data
378 if downloader_params.get('username', None) is not None:
379 username = downloader_params['username']
380 password = downloader_params['password']
381 elif downloader_params.get('usenetrc', False):
383 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
388 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
389 except (IOError, netrc.NetrcParseError) as err:
390 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
394 request = compat_urllib_request.Request(self._LANG_URL)
397 compat_urllib_request.urlopen(request).read()
398 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
399 self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
402 # No authentication to be performed
406 request = compat_urllib_request.Request(self._LOGIN_URL)
408 login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
409 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
410 self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
415 match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
417 galx = match.group(1)
419 match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
425 u'continue': u'http://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
429 u'PersistentCookie': u'yes',
431 u'bgresponse': u'js_disabled',
432 u'checkConnection': u'',
433 u'checkedDomains': u'youtube',
439 u'signIn': u'Sign in',
441 u'service': u'youtube',
445 # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
447 login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
448 login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
449 request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
452 login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
453 if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
454 self._downloader.report_warning(u'unable to log in: bad username or password')
456 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
457 self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
463 'action_confirm': 'Confirm',
465 request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
467 self.report_age_confirmation()
468 age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
469 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
470 self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
473 def _extract_id(self, url):
474 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
476 self._downloader.report_error(u'invalid URL: %s' % url)
478 video_id = mobj.group(2)
481 def _real_extract(self, url):
482 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
483 mobj = re.search(self._NEXT_URL_RE, url)
485 url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
486 video_id = self._extract_id(url)
489 self.report_video_webpage_download(video_id)
490 url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
491 request = compat_urllib_request.Request(url)
493 video_webpage_bytes = compat_urllib_request.urlopen(request).read()
494 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
495 self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
498 video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')
500 # Attempt to extract SWF player URL
501 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
503 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
508 self.report_video_info_webpage_download(video_id)
509 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
510 video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
511 % (video_id, el_type))
512 video_info_webpage = self._download_webpage(video_info_url, video_id,
514 errnote='unable to download video info webpage')
515 video_info = compat_parse_qs(video_info_webpage)
516 if 'token' in video_info:
518 if 'token' not in video_info:
519 if 'reason' in video_info:
520 self._downloader.report_error(u'YouTube said: %s' % video_info['reason'][0])
522 self._downloader.report_error(u'"token" parameter not in video info for unknown reason')
525 # Check for "rental" videos
526 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
527 self._downloader.report_error(u'"rental" videos not supported')
530 # Start extracting information
531 self.report_information_extraction(video_id)
534 if 'author' not in video_info:
535 self._downloader.report_error(u'unable to extract uploader name')
537 video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
540 video_uploader_id = None
541 mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
543 video_uploader_id = mobj.group(1)
545 self._downloader.report_warning(u'unable to extract uploader nickname')
548 if 'title' not in video_info:
549 self._downloader.report_error(u'unable to extract video title')
551 video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
554 if 'thumbnail_url' not in video_info:
555 self._downloader.report_warning(u'unable to extract video thumbnail')
557 else: # don't panic if we can't find it
558 video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
562 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
564 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
565 upload_date = unified_strdate(upload_date)
568 video_description = get_element_by_id("eow-description", video_webpage)
569 if video_description:
570 video_description = clean_html(video_description)
572 fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
574 video_description = unescapeHTML(fd_mobj.group(1))
576 video_description = u''
579 video_subtitles = None
581 if self._downloader.params.get('writesubtitles', False):
582 video_subtitles = self._extract_subtitle(video_id)
584 (sub_error, sub_lang, sub) = video_subtitles[0]
586 self._downloader.report_error(sub_error)
588 if self._downloader.params.get('allsubtitles', False):
589 video_subtitles = self._extract_all_subtitles(video_id)
590 for video_subtitle in video_subtitles:
591 (sub_error, sub_lang, sub) = video_subtitle
593 self._downloader.report_error(sub_error)
595 if self._downloader.params.get('listsubtitles', False):
596 sub_lang_list = self._list_available_subtitles(video_id)
599 if 'length_seconds' not in video_info:
600 self._downloader.report_warning(u'unable to extract video duration')
603 video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
606 video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])
608 # Decide which formats to download
609 req_format = self._downloader.params.get('format', None)
611 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
612 self.report_rtmp_download()
613 video_url_list = [(None, video_info['conn'][0])]
614 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
615 url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
616 url_data = [compat_parse_qs(uds) for uds in url_data_strs]
617 url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
618 url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)
620 format_limit = self._downloader.params.get('format_limit', None)
621 available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
622 if format_limit is not None and format_limit in available_formats:
623 format_list = available_formats[available_formats.index(format_limit):]
625 format_list = available_formats
626 existing_formats = [x for x in format_list if x in url_map]
627 if len(existing_formats) == 0:
628 raise ExtractorError(u'no known formats available for video')
629 if self._downloader.params.get('listformats', None):
630 self._print_formats(existing_formats)
632 if req_format is None or req_format == 'best':
633 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
634 elif req_format == 'worst':
635 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
636 elif req_format in ('-1', 'all'):
637 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
639 # Specific formats. We pick the first in a slash-delimeted sequence.
640 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
641 req_formats = req_format.split('/')
642 video_url_list = None
643 for rf in req_formats:
645 video_url_list = [(rf, url_map[rf])]
647 if video_url_list is None:
648 raise ExtractorError(u'requested format not available')
650 raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')
653 for format_param, video_real_url in video_url_list:
655 video_extension = self._video_extensions.get(format_param, 'flv')
657 video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
658 self._video_dimensions.get(format_param, '???'))
662 'url': video_real_url,
663 'uploader': video_uploader,
664 'uploader_id': video_uploader_id,
665 'upload_date': upload_date,
666 'title': video_title,
667 'ext': video_extension,
668 'format': video_format,
669 'thumbnail': video_thumbnail,
670 'description': video_description,
671 'player_url': player_url,
672 'subtitles': video_subtitles,
673 'duration': video_duration
678 class MetacafeIE(InfoExtractor):
679 """Information Extractor for metacafe.com."""
681 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
682 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
683 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
684 IE_NAME = u'metacafe'
686 def report_disclaimer(self):
687 """Report disclaimer retrieval."""
688 self.to_screen(u'Retrieving disclaimer')
690 def _real_initialize(self):
691 # Retrieve disclaimer
692 request = compat_urllib_request.Request(self._DISCLAIMER)
694 self.report_disclaimer()
695 disclaimer = compat_urllib_request.urlopen(request).read()
696 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
697 self._downloader.report_error(u'unable to retrieve disclaimer: %s' % compat_str(err))
703 'submit': "Continue - I'm over 18",
705 request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
707 self.report_age_confirmation()
708 disclaimer = compat_urllib_request.urlopen(request).read()
709 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
710 self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
713 def _real_extract(self, url):
714 # Extract id and simplified title from URL
715 mobj = re.match(self._VALID_URL, url)
717 self._downloader.report_error(u'invalid URL: %s' % url)
720 video_id = mobj.group(1)
722 # Check if video comes from YouTube
723 mobj2 = re.match(r'^yt-(.*)$', video_id)
724 if mobj2 is not None:
725 return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1), 'Youtube')]
727 # Retrieve video webpage to extract further information
728 webpage = self._download_webpage('http://www.metacafe.com/watch/%s/' % video_id, video_id)
730 # Extract URL, uploader and title from webpage
731 self.report_extraction(video_id)
732 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
734 mediaURL = compat_urllib_parse.unquote(mobj.group(1))
735 video_extension = mediaURL[-3:]
737 # Extract gdaKey if available
738 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
742 gdaKey = mobj.group(1)
743 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
745 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
747 self._downloader.report_error(u'unable to extract media URL')
749 vardict = compat_parse_qs(mobj.group(1))
750 if 'mediaData' not in vardict:
751 self._downloader.report_error(u'unable to extract media URL')
753 mobj = re.search(r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
755 self._downloader.report_error(u'unable to extract media URL')
757 mediaURL = mobj.group('mediaURL').replace('\\/', '/')
758 video_extension = mediaURL[-3:]
759 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key'))
761 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
763 self._downloader.report_error(u'unable to extract title')
765 video_title = mobj.group(1).decode('utf-8')
767 mobj = re.search(r'submitter=(.*?);', webpage)
769 self._downloader.report_error(u'unable to extract uploader nickname')
771 video_uploader = mobj.group(1)
774 'id': video_id.decode('utf-8'),
775 'url': video_url.decode('utf-8'),
776 'uploader': video_uploader.decode('utf-8'),
778 'title': video_title,
779 'ext': video_extension.decode('utf-8'),
783 class DailymotionIE(InfoExtractor):
784 """Information Extractor for Dailymotion"""
786 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
787 IE_NAME = u'dailymotion'
790 def _real_extract(self, url):
791 # Extract id and simplified title from URL
792 mobj = re.match(self._VALID_URL, url)
794 self._downloader.report_error(u'invalid URL: %s' % url)
797 video_id = mobj.group(1).split('_')[0].split('?')[0]
799 video_extension = 'mp4'
801 # Retrieve video webpage to extract further information
802 request = compat_urllib_request.Request(url)
803 request.add_header('Cookie', 'family_filter=off')
804 webpage = self._download_webpage(request, video_id)
806 # Extract URL, uploader and title from webpage
807 self.report_extraction(video_id)
808 mobj = re.search(r'\s*var flashvars = (.*)', webpage)
810 self._downloader.report_error(u'unable to extract media URL')
812 flashvars = compat_urllib_parse.unquote(mobj.group(1))
814 for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
817 self.to_screen(u'Using %s' % key)
820 self._downloader.report_error(u'unable to extract video URL')
823 mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
825 self._downloader.report_error(u'unable to extract video URL')
828 video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')
830 # TODO: support choosing qualities
832 mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
834 self._downloader.report_error(u'unable to extract title')
836 video_title = unescapeHTML(mobj.group('title'))
838 video_uploader = None
839 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
841 # lookin for official user
842 mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
843 if mobj_official is None:
844 self._downloader.report_warning(u'unable to extract uploader nickname')
846 video_uploader = mobj_official.group(1)
848 video_uploader = mobj.group(1)
850 video_upload_date = None
851 mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
853 video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)
858 'uploader': video_uploader,
859 'upload_date': video_upload_date,
860 'title': video_title,
861 'ext': video_extension,
865 class PhotobucketIE(InfoExtractor):
866 """Information extractor for photobucket.com."""
868 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
869 IE_NAME = u'photobucket'
871 def _real_extract(self, url):
872 # Extract id from URL
873 mobj = re.match(self._VALID_URL, url)
875 self._downloader.report_error(u'Invalid URL: %s' % url)
878 video_id = mobj.group(1)
880 video_extension = 'flv'
882 # Retrieve video webpage to extract further information
883 request = compat_urllib_request.Request(url)
885 self.report_download_webpage(video_id)
886 webpage = compat_urllib_request.urlopen(request).read()
887 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
888 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
891 # Extract URL, uploader, and title from webpage
892 self.report_extraction(video_id)
893 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
895 self._downloader.report_error(u'unable to extract media URL')
897 mediaURL = compat_urllib_parse.unquote(mobj.group(1))
901 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
903 self._downloader.report_error(u'unable to extract title')
905 video_title = mobj.group(1).decode('utf-8')
907 video_uploader = mobj.group(2).decode('utf-8')
910 'id': video_id.decode('utf-8'),
911 'url': video_url.decode('utf-8'),
912 'uploader': video_uploader,
914 'title': video_title,
915 'ext': video_extension.decode('utf-8'),
919 class YahooIE(InfoExtractor):
920 """Information extractor for video.yahoo.com."""
923 # _VALID_URL matches all Yahoo! Video URLs
924 # _VPAGE_URL matches only the extractable '/watch/' URLs
925 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
926 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
927 IE_NAME = u'video.yahoo'
929 def _real_extract(self, url, new_video=True):
930 # Extract ID from URL
931 mobj = re.match(self._VALID_URL, url)
933 self._downloader.report_error(u'Invalid URL: %s' % url)
936 video_id = mobj.group(2)
937 video_extension = 'flv'
939 # Rewrite valid but non-extractable URLs as
940 # extractable English language /watch/ URLs
941 if re.match(self._VPAGE_URL, url) is None:
942 request = compat_urllib_request.Request(url)
944 webpage = compat_urllib_request.urlopen(request).read()
945 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
946 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
949 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
951 self._downloader.report_error(u'Unable to extract id field')
953 yahoo_id = mobj.group(1)
955 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
957 self._downloader.report_error(u'Unable to extract vid field')
959 yahoo_vid = mobj.group(1)
961 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
962 return self._real_extract(url, new_video=False)
964 # Retrieve video webpage to extract further information
965 request = compat_urllib_request.Request(url)
967 self.report_download_webpage(video_id)
968 webpage = compat_urllib_request.urlopen(request).read()
969 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
970 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
973 # Extract uploader and title from webpage
974 self.report_extraction(video_id)
975 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
977 self._downloader.report_error(u'unable to extract video title')
979 video_title = mobj.group(1).decode('utf-8')
981 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
983 self._downloader.report_error(u'unable to extract video uploader')
985 video_uploader = mobj.group(1).decode('utf-8')
987 # Extract video thumbnail
988 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
990 self._downloader.report_error(u'unable to extract video thumbnail')
992 video_thumbnail = mobj.group(1).decode('utf-8')
994 # Extract video description
995 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
997 self._downloader.report_error(u'unable to extract video description')
999 video_description = mobj.group(1).decode('utf-8')
1000 if not video_description:
1001 video_description = 'No description available.'
1003 # Extract video height and width
1004 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1006 self._downloader.report_error(u'unable to extract video height')
1008 yv_video_height = mobj.group(1)
1010 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1012 self._downloader.report_error(u'unable to extract video width')
1014 yv_video_width = mobj.group(1)
1016 # Retrieve video playlist to extract media URL
1017 # I'm not completely sure what all these options are, but we
1018 # seem to need most of them, otherwise the server sends a 401.
1019 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
1020 yv_bitrate = '700' # according to Wikipedia this is hard-coded
1021 request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1022 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1023 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1025 self.report_download_webpage(video_id)
1026 webpage = compat_urllib_request.urlopen(request).read()
1027 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1028 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
1031 # Extract media URL from playlist XML
1032 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1034 self._downloader.report_error(u'Unable to extract media URL')
1036 video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1037 video_url = unescapeHTML(video_url)
1040 'id': video_id.decode('utf-8'),
1042 'uploader': video_uploader,
1043 'upload_date': None,
1044 'title': video_title,
1045 'ext': video_extension.decode('utf-8'),
1046 'thumbnail': video_thumbnail.decode('utf-8'),
1047 'description': video_description,
1051 class VimeoIE(InfoExtractor):
1052 """Information extractor for vimeo.com."""
1054 # _VALID_URL matches Vimeo URLs
1055 _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
1058 def _real_extract(self, url, new_video=True):
1059 # Extract ID from URL
1060 mobj = re.match(self._VALID_URL, url)
1062 self._downloader.report_error(u'Invalid URL: %s' % url)
1065 video_id = mobj.group('id')
1066 if not mobj.group('proto'):
1067 url = 'https://' + url
1068 if mobj.group('direct_link'):
1069 url = 'https://vimeo.com/' + video_id
1071 # Retrieve video webpage to extract further information
1072 request = compat_urllib_request.Request(url, None, std_headers)
1074 self.report_download_webpage(video_id)
1075 webpage_bytes = compat_urllib_request.urlopen(request).read()
1076 webpage = webpage_bytes.decode('utf-8')
1077 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1078 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
1081 # Now we begin extracting as much information as we can from what we
1082 # retrieved. First we extract the information common to all extractors,
1083 # and latter we extract those that are Vimeo specific.
1084 self.report_extraction(video_id)
1086 # Extract the config JSON
1088 config = webpage.split(' = {config:')[1].split(',assets:')[0]
1089 config = json.loads(config)
1091 if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage):
1092 self._downloader.report_error(u'The author has restricted the access to this video, try with the "--referer" option')
1094 self._downloader.report_error(u'unable to extract info section')
1098 video_title = config["video"]["title"]
1100 # Extract uploader and uploader_id
1101 video_uploader = config["video"]["owner"]["name"]
1102 video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]
1104 # Extract video thumbnail
1105 video_thumbnail = config["video"]["thumbnail"]
1107 # Extract video description
1108 video_description = get_element_by_attribute("itemprop", "description", webpage)
1109 if video_description: video_description = clean_html(video_description)
1110 else: video_description = u''
1112 # Extract upload date
1113 video_upload_date = None
1114 mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
1115 if mobj is not None:
1116 video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)
1118 # Vimeo specific: extract request signature and timestamp
1119 sig = config['request']['signature']
1120 timestamp = config['request']['timestamp']
1122 # Vimeo specific: extract video codec and quality information
1123 # First consider quality, then codecs, then take everything
1124 # TODO bind to format param
1125 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1126 files = { 'hd': [], 'sd': [], 'other': []}
1127 for codec_name, codec_extension in codecs:
1128 if codec_name in config["video"]["files"]:
1129 if 'hd' in config["video"]["files"][codec_name]:
1130 files['hd'].append((codec_name, codec_extension, 'hd'))
1131 elif 'sd' in config["video"]["files"][codec_name]:
1132 files['sd'].append((codec_name, codec_extension, 'sd'))
1134 files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))
1136 for quality in ('hd', 'sd', 'other'):
1137 if len(files[quality]) > 0:
1138 video_quality = files[quality][0][2]
1139 video_codec = files[quality][0][0]
1140 video_extension = files[quality][0][1]
1141 self.to_screen(u'%s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
1144 self._downloader.report_error(u'no known codec found')
1147 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1148 %(video_id, sig, timestamp, video_quality, video_codec.upper())
1153 'uploader': video_uploader,
1154 'uploader_id': video_uploader_id,
1155 'upload_date': video_upload_date,
1156 'title': video_title,
1157 'ext': video_extension,
1158 'thumbnail': video_thumbnail,
1159 'description': video_description,
1163 class ArteTvIE(InfoExtractor):
1164 """arte.tv information extractor."""
1166 _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
1167 _LIVE_URL = r'index-[0-9]+\.html$'
1169 IE_NAME = u'arte.tv'
1171 def fetch_webpage(self, url):
1172 request = compat_urllib_request.Request(url)
1174 self.report_download_webpage(url)
1175 webpage = compat_urllib_request.urlopen(request).read()
1176 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1177 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
1179 except ValueError as err:
1180 self._downloader.report_error(u'Invalid URL: %s' % url)
1184 def grep_webpage(self, url, regex, regexFlags, matchTuples):
1185 page = self.fetch_webpage(url)
1186 mobj = re.search(regex, page, regexFlags)
1190 self._downloader.report_error(u'Invalid URL: %s' % url)
1193 for (i, key, err) in matchTuples:
1194 if mobj.group(i) is None:
1195 self._downloader.report_error(err)
1198 info[key] = mobj.group(i)
1202 def extractLiveStream(self, url):
1203 video_lang = url.split('/')[-4]
1204 info = self.grep_webpage(
1206 r'src="(.*?/videothek_js.*?\.js)',
1209 (1, 'url', u'Invalid URL: %s' % url)
1212 http_host = url.split('/')[2]
1213 next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
1214 info = self.grep_webpage(
1216 r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
1217 '(http://.*?\.swf).*?' +
1221 (1, 'path', u'could not extract video path: %s' % url),
1222 (2, 'player', u'could not extract video player: %s' % url),
1223 (3, 'url', u'could not extract video url: %s' % url)
1226 video_url = u'%s/%s' % (info.get('url'), info.get('path'))
1228 def extractPlus7Stream(self, url):
1229 video_lang = url.split('/')[-3]
1230 info = self.grep_webpage(
1232 r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
1235 (1, 'url', u'Invalid URL: %s' % url)
1238 next_url = compat_urllib_parse.unquote(info.get('url'))
1239 info = self.grep_webpage(
1241 r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
1244 (1, 'url', u'Could not find <video> tag: %s' % url)
1247 next_url = compat_urllib_parse.unquote(info.get('url'))
1249 info = self.grep_webpage(
1251 r'<video id="(.*?)".*?>.*?' +
1252 '<name>(.*?)</name>.*?' +
1253 '<dateVideo>(.*?)</dateVideo>.*?' +
1254 '<url quality="hd">(.*?)</url>',
1257 (1, 'id', u'could not extract video id: %s' % url),
1258 (2, 'title', u'could not extract video title: %s' % url),
1259 (3, 'date', u'could not extract video date: %s' % url),
1260 (4, 'url', u'could not extract video url: %s' % url)
1265 'id': info.get('id'),
1266 'url': compat_urllib_parse.unquote(info.get('url')),
1267 'uploader': u'arte.tv',
1268 'upload_date': info.get('date'),
1269 'title': info.get('title').decode('utf-8'),
1275 def _real_extract(self, url):
1276 video_id = url.split('/')[-1]
1277 self.report_extraction(video_id)
1279 if re.search(self._LIVE_URL, video_id) is not None:
1280 self.extractLiveStream(url)
1283 info = self.extractPlus7Stream(url)
1288 class GenericIE(InfoExtractor):
1289 """Generic last-resort information extractor."""
1292 IE_NAME = u'generic'
1294 def report_download_webpage(self, video_id):
1295 """Report webpage download."""
1296 if not self._downloader.params.get('test', False):
1297 self._downloader.report_warning(u'Falling back on generic information extractor.')
1298 super(GenericIE, self).report_download_webpage(video_id)
1300 def report_following_redirect(self, new_url):
1301 """Report information extraction."""
1302 self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1304 def _test_redirect(self, url):
1305 """Check if it is a redirect, like url shorteners, in case return the new url."""
1306 class HeadRequest(compat_urllib_request.Request):
1307 def get_method(self):
1310 class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
1312 Subclass the HTTPRedirectHandler to make it use our
1313 HeadRequest also on the redirected URL
1315 def redirect_request(self, req, fp, code, msg, headers, newurl):
1316 if code in (301, 302, 303, 307):
1317 newurl = newurl.replace(' ', '%20')
1318 newheaders = dict((k,v) for k,v in req.headers.items()
1319 if k.lower() not in ("content-length", "content-type"))
1320 return HeadRequest(newurl,
1322 origin_req_host=req.get_origin_req_host(),
1325 raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)
1327 class HTTPMethodFallback(compat_urllib_request.BaseHandler):
1329 Fallback to GET if HEAD is not allowed (405 HTTP error)
1331 def http_error_405(self, req, fp, code, msg, headers):
1335 newheaders = dict((k,v) for k,v in req.headers.items()
1336 if k.lower() not in ("content-length", "content-type"))
1337 return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
1339 origin_req_host=req.get_origin_req_host(),
1343 opener = compat_urllib_request.OpenerDirector()
1344 for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
1345 HTTPMethodFallback, HEADRedirectHandler,
1346 compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
1347 opener.add_handler(handler())
1349 response = opener.open(HeadRequest(url))
1350 new_url = response.geturl()
1355 self.report_following_redirect(new_url)
1358 def _real_extract(self, url):
1359 new_url = self._test_redirect(url)
1360 if new_url: return [self.url_result(new_url)]
1362 video_id = url.split('/')[-1]
1364 webpage = self._download_webpage(url, video_id)
1365 except ValueError as err:
1366 # since this is the last-resort InfoExtractor, if
1367 # this error is thrown, it'll be thrown here
1368 self._downloader.report_error(u'Invalid URL: %s' % url)
1371 self.report_extraction(video_id)
1372 # Start with something easy: JW Player in SWFObject
1373 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1375 # Broaden the search a little bit
1376 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1378 # Broaden the search a little bit: JWPlayer JS loader
1379 mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
1381 self._downloader.report_error(u'Invalid URL: %s' % url)
1384 # It's possible that one of the regexes
1385 # matched, but returned an empty group:
1386 if mobj.group(1) is None:
1387 self._downloader.report_error(u'Invalid URL: %s' % url)
1390 video_url = compat_urllib_parse.unquote(mobj.group(1))
1391 video_id = os.path.basename(video_url)
1393 # here's a fun little line of code for you:
1394 video_extension = os.path.splitext(video_id)[1][1:]
1395 video_id = os.path.splitext(video_id)[0]
1397 # it's tempting to parse this further, but you would
1398 # have to take into account all the variations like
1399 # Video Title - Site Name
1400 # Site Name | Video Title
1401 # Video Title - Tagline | Site Name
1402 # and so on and so forth; it's just not practical
1403 mobj = re.search(r'<title>(.*)</title>', webpage)
1405 self._downloader.report_error(u'unable to extract title')
1407 video_title = mobj.group(1)
1409 # video uploader is domain name
1410 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1412 self._downloader.report_error(u'unable to extract title')
1414 video_uploader = mobj.group(1)
1419 'uploader': video_uploader,
1420 'upload_date': None,
1421 'title': video_title,
1422 'ext': video_extension,
1426 class YoutubeSearchIE(InfoExtractor):
1427 """Information Extractor for YouTube search queries."""
1428 _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
1429 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1430 _max_youtube_results = 1000
1431 IE_NAME = u'youtube:search'
1433 def report_download_page(self, query, pagenum):
1434 """Report attempt to download search page with given number."""
1435 query = query.decode(preferredencoding())
1436 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1438 def _real_extract(self, query):
1439 mobj = re.match(self._VALID_URL, query)
1441 self._downloader.report_error(u'invalid search query "%s"' % query)
1444 prefix, query = query.split(':')
1446 query = query.encode('utf-8')
1448 return self._get_n_results(query, 1)
1449 elif prefix == 'all':
1450 self._get_n_results(query, self._max_youtube_results)
1455 self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
1457 elif n > self._max_youtube_results:
1458 self._downloader.report_warning(u'ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1459 n = self._max_youtube_results
1460 return self._get_n_results(query, n)
1461 except ValueError: # parsing prefix as integer fails
1462 return self._get_n_results(query, 1)
1464 def _get_n_results(self, query, n):
1465 """Get a specified number of results for a query"""
1471 while (50 * pagenum) < limit:
1472 self.report_download_page(query, pagenum+1)
1473 result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
1474 request = compat_urllib_request.Request(result_url)
1476 data = compat_urllib_request.urlopen(request).read().decode('utf-8')
1477 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1478 self._downloader.report_error(u'unable to download API page: %s' % compat_str(err))
1480 api_response = json.loads(data)['data']
1482 if not 'items' in api_response:
1483 self._downloader.report_error(u'[youtube] No video results')
1486 new_ids = list(video['id'] for video in api_response['items'])
1487 video_ids += new_ids
1489 limit = min(n, api_response['totalItems'])
1492 if len(video_ids) > n:
1493 video_ids = video_ids[:n]
1494 videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids]
1498 class GoogleSearchIE(InfoExtractor):
1499 """Information Extractor for Google Video search queries."""
1500 _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
1501 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1502 _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
1503 _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
1504 _max_google_results = 1000
1505 IE_NAME = u'video.google:search'
1507 def report_download_page(self, query, pagenum):
1508 """Report attempt to download playlist page with given number."""
1509 query = query.decode(preferredencoding())
1510 self.to_screen(u'query "%s": Downloading page %s' % (query, pagenum))
1512 def _real_extract(self, query):
1513 mobj = re.match(self._VALID_URL, query)
1515 self._downloader.report_error(u'invalid search query "%s"' % query)
1518 prefix, query = query.split(':')
1520 query = query.encode('utf-8')
1522 self._download_n_results(query, 1)
1524 elif prefix == 'all':
1525 self._download_n_results(query, self._max_google_results)
1531 self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
1533 elif n > self._max_google_results:
1534 self._downloader.report_warning(u'gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1535 n = self._max_google_results
1536 self._download_n_results(query, n)
1538 except ValueError: # parsing prefix as integer fails
1539 self._download_n_results(query, 1)
1542 def _download_n_results(self, query, n):
1543 """Downloads a specified number of results for a query"""
1549 self.report_download_page(query, pagenum)
1550 result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
1551 request = compat_urllib_request.Request(result_url)
1553 page = compat_urllib_request.urlopen(request).read()
1554 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1555 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
1558 # Extract video identifiers
1559 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1560 video_id = mobj.group(1)
1561 if video_id not in video_ids:
1562 video_ids.append(video_id)
1563 if len(video_ids) == n:
1564 # Specified n videos reached
1565 for id in video_ids:
1566 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1569 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1570 for id in video_ids:
1571 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1574 pagenum = pagenum + 1
1577 class YahooSearchIE(InfoExtractor):
1578 """Information Extractor for Yahoo! Video search queries."""
1581 _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
1582 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1583 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1584 _MORE_PAGES_INDICATOR = r'\s*Next'
1585 _max_yahoo_results = 1000
1586 IE_NAME = u'video.yahoo:search'
1588 def report_download_page(self, query, pagenum):
1589 """Report attempt to download playlist page with given number."""
1590 query = query.decode(preferredencoding())
1591 self.to_screen(u'query "%s": Downloading page %s' % (query, pagenum))
1593 def _real_extract(self, query):
1594 mobj = re.match(self._VALID_URL, query)
1596 self._downloader.report_error(u'invalid search query "%s"' % query)
1599 prefix, query = query.split(':')
1601 query = query.encode('utf-8')
1603 self._download_n_results(query, 1)
1605 elif prefix == 'all':
1606 self._download_n_results(query, self._max_yahoo_results)
1612 self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
1614 elif n > self._max_yahoo_results:
1615 self._downloader.report_warning(u'yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1616 n = self._max_yahoo_results
1617 self._download_n_results(query, n)
1619 except ValueError: # parsing prefix as integer fails
1620 self._download_n_results(query, 1)
1623 def _download_n_results(self, query, n):
1624 """Downloads a specified number of results for a query"""
1627 already_seen = set()
1631 self.report_download_page(query, pagenum)
1632 result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
1633 request = compat_urllib_request.Request(result_url)
1635 page = compat_urllib_request.urlopen(request).read()
1636 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1637 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
1640 # Extract video identifiers
1641 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1642 video_id = mobj.group(1)
1643 if video_id not in already_seen:
1644 video_ids.append(video_id)
1645 already_seen.add(video_id)
1646 if len(video_ids) == n:
1647 # Specified n videos reached
1648 for id in video_ids:
1649 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1652 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1653 for id in video_ids:
1654 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1657 pagenum = pagenum + 1
1660 class YoutubePlaylistIE(InfoExtractor):
1661 """Information Extractor for YouTube playlists."""
1663 _VALID_URL = r"""(?:
1668 (?:course|view_play_list|my_playlists|artist|playlist|watch)
1669 \? (?:.*?&)*? (?:p|a|list)=
1672 ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
1675 ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
1677 _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
1679 IE_NAME = u'youtube:playlist'
1682 def suitable(cls, url):
1683 """Receives a URL and returns True if suitable for this IE."""
1684 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
1686 def report_download_page(self, playlist_id, pagenum):
1687 """Report attempt to download playlist page with given number."""
1688 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1690 def _real_extract(self, url):
1691 # Extract playlist id
1692 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
1694 self._downloader.report_error(u'invalid url: %s' % url)
1697 # Download playlist videos from API
1698 playlist_id = mobj.group(1) or mobj.group(2)
1703 self.report_download_page(playlist_id, page_num)
1705 url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
1707 page = compat_urllib_request.urlopen(url).read().decode('utf8')
1708 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1709 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
1713 response = json.loads(page)
1714 except ValueError as err:
1715 self._downloader.report_error(u'Invalid JSON in API response: ' + compat_str(err))
1718 if 'feed' not in response:
1719 self._downloader.report_error(u'Got a malformed response from YouTube API')
1721 playlist_title = response['feed']['title']['$t']
1722 if 'entry' not in response['feed']:
1723 # Number of videos is a multiple of self._MAX_RESULTS
1726 videos += [ (entry['yt$position']['$t'], entry['content']['src'])
1727 for entry in response['feed']['entry']
1728 if 'content' in entry ]
1730 if len(response['feed']['entry']) < self._MAX_RESULTS:
1734 videos = [v[1] for v in sorted(videos)]
1736 url_results = [self.url_result(url, 'Youtube') for url in videos]
1737 return [self.playlist_result(url_results, playlist_id, playlist_title)]
1740 class YoutubeChannelIE(InfoExtractor):
1741 """Information Extractor for YouTube channels."""
1743 _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
1744 _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
1745 _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
1746 _MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
1747 IE_NAME = u'youtube:channel'
1749 def report_download_page(self, channel_id, pagenum):
1750 """Report attempt to download channel page with given number."""
1751 self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))
1753 def extract_videos_from_page(self, page):
1755 for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
1756 if mobj.group(1) not in ids_in_page:
1757 ids_in_page.append(mobj.group(1))
1760 def _real_extract(self, url):
1761 # Extract channel id
1762 mobj = re.match(self._VALID_URL, url)
1764 self._downloader.report_error(u'invalid url: %s' % url)
1767 # Download channel page
1768 channel_id = mobj.group(1)
1772 self.report_download_page(channel_id, pagenum)
1773 url = self._TEMPLATE_URL % (channel_id, pagenum)
1774 request = compat_urllib_request.Request(url)
1776 page = compat_urllib_request.urlopen(request).read().decode('utf8')
1777 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1778 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
1781 # Extract video identifiers
1782 ids_in_page = self.extract_videos_from_page(page)
1783 video_ids.extend(ids_in_page)
1785 # Download any subsequent channel pages using the json-based channel_ajax query
1786 if self._MORE_PAGES_INDICATOR in page:
1788 pagenum = pagenum + 1
1790 self.report_download_page(channel_id, pagenum)
1791 url = self._MORE_PAGES_URL % (pagenum, channel_id)
1792 request = compat_urllib_request.Request(url)
1794 page = compat_urllib_request.urlopen(request).read().decode('utf8')
1795 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1796 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
1799 page = json.loads(page)
1801 ids_in_page = self.extract_videos_from_page(page['content_html'])
1802 video_ids.extend(ids_in_page)
1804 if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
1807 self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))
1809 urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids]
1810 url_entries = [self.url_result(url, 'Youtube') for url in urls]
1811 return [self.playlist_result(url_entries, channel_id)]
1814 class YoutubeUserIE(InfoExtractor):
1815 """Information Extractor for YouTube users."""
1817 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1818 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1819 _GDATA_PAGE_SIZE = 50
1820 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
1821 _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
1822 IE_NAME = u'youtube:user'
1824 def report_download_page(self, username, start_index):
1825 """Report attempt to download user page."""
1826 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
1827 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
1829 def _real_extract(self, url):
1831 mobj = re.match(self._VALID_URL, url)
1833 self._downloader.report_error(u'invalid url: %s' % url)
1836 username = mobj.group(1)
1838 # Download video ids using YouTube Data API. Result size per
1839 # query is limited (currently to 50 videos) so we need to query
1840 # page by page until there are no video ids - it means we got
1847 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1848 self.report_download_page(username, start_index)
1850 request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
1853 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1854 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1855 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
1858 # Extract video identifiers
1861 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1862 if mobj.group(1) not in ids_in_page:
1863 ids_in_page.append(mobj.group(1))
1865 video_ids.extend(ids_in_page)
1867 # A little optimization - if current page is not
1868 # "full", ie. does not contain PAGE_SIZE video ids then
1869 # we can assume that this page is the last one - there
1870 # are no more ids on further pages - no need to query
1873 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
1878 urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
1879 url_results = [self.url_result(url, 'Youtube') for url in urls]
1880 return [self.playlist_result(url_results, playlist_title = username)]
1883 class BlipTVUserIE(InfoExtractor):
1884 """Information Extractor for blip.tv users."""
1886 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
1888 IE_NAME = u'blip.tv:user'
1890 def report_download_page(self, username, pagenum):
1891 """Report attempt to download user page."""
1892 self.to_screen(u'user %s: Downloading video ids from page %d' %
1893 (username, pagenum))
1895 def _real_extract(self, url):
1897 mobj = re.match(self._VALID_URL, url)
1899 self._downloader.report_error(u'invalid url: %s' % url)
1902 username = mobj.group(1)
1904 page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
1906 request = compat_urllib_request.Request(url)
1909 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1910 mobj = re.search(r'data-users-id="([^"]+)"', page)
1911 page_base = page_base % mobj.group(1)
1912 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1913 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
1917 # Download video ids using BlipTV Ajax calls. Result size per
1918 # query is limited (currently to 12 videos) so we need to query
1919 # page by page until there are no video ids - it means we got
1926 self.report_download_page(username, pagenum)
1927 url = page_base + "&page=" + str(pagenum)
1928 request = compat_urllib_request.Request( url )
1930 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1931 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1932 self._downloader.report_error(u'unable to download webpage: %s' % str(err))
1935 # Extract video identifiers
1938 for mobj in re.finditer(r'href="/([^"]+)"', page):
1939 if mobj.group(1) not in ids_in_page:
1940 ids_in_page.append(unescapeHTML(mobj.group(1)))
1942 video_ids.extend(ids_in_page)
1944 # A little optimization - if current page is not
1945 # "full", ie. does not contain PAGE_SIZE video ids then
1946 # we can assume that this page is the last one - there
1947 # are no more ids on further pages - no need to query
1950 if len(ids_in_page) < self._PAGE_SIZE:
1955 urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
1956 url_entries = [self.url_result(url, 'BlipTV') for url in urls]
1957 return [self.playlist_result(url_entries, playlist_title = username)]
1960 class DepositFilesIE(InfoExtractor):
1961     """Information extractor for depositfiles.com"""
# NOTE(review): original line numbers jump (1961->1963 etc.) — intervening
# lines (blank lines, `try:` headers, `return` statements) are elided from
# this view; control flow around the numbered lines is not fully visible.
1963     _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
1965     def _real_extract(self, url):
# The file id is the last path component of the given URL.
1966         file_id = url.split('/')[-1]
1967         # Rebuild url in english locale
1968         url = 'http://depositfiles.com/en/files/' + file_id
1970         # Retrieve file webpage with 'Free download' button pressed
# POST body simulating the "Free download" form submission.
1971         free_download_indication = { 'gateway_result' : '1' }
1972         request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
1974             self.report_download_webpage(file_id)
1975             webpage = compat_urllib_request.urlopen(request).read()
1976         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1977             self._downloader.report_error(u'Unable to retrieve file webpage: %s' % compat_str(err))
1980         # Search for the real file URL
1981         mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
1982         if (mobj is None) or (mobj.group(1) is None):
1983             # Try to figure out reason of the error.
# Site-displayed restriction message (e.g. download limits) shown in a <strong> tag.
1984             mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
1985             if (mobj is not None) and (mobj.group(1) is not None):
# NOTE(review): '\s+' is a non-raw string — works, but r'\s+' is the safe spelling.
1986                 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
1987                 self._downloader.report_error(u'%s' % restriction_message)
1989                 self._downloader.report_error(u'unable to extract download URL from: %s' % url)
1992         file_url = mobj.group(1)
# Extension without the leading dot, e.g. 'zip'.
1993         file_extension = os.path.splitext(file_url)[1][1:]
1995         # Search for file title
1996         mobj = re.search(r'<b title="(.*?)">', webpage)
1998             self._downloader.report_error(u'unable to extract title')
# NOTE(review): .decode('utf-8') on these values assumes Python-2 byte
# strings; under Python 3 str has no .decode — confirm target runtime.
2000         file_title = mobj.group(1).decode('utf-8')
2003             'id': file_id.decode('utf-8'),
2004             'url': file_url.decode('utf-8'),
2006             'upload_date': None,
2007             'title': file_title,
2008             'ext': file_extension.decode('utf-8'),
2012 class FacebookIE(InfoExtractor):
2013     """Information Extractor for Facebook"""
# NOTE(review): gaps in the original line numbering mean some statements
# (blank lines, `try:` headers, `return`s, dict literals) are not visible here.
2015     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
# Mobile login endpoint; `next=` redirects back to m.facebook.com after login.
2016     _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2017     _NETRC_MACHINE = 'facebook'
2018     IE_NAME = u'facebook'
2020     def report_login(self):
2021         """Report attempt to log in."""
2022         self.to_screen(u'Logging in')
# Optional login: credentials come from --username/--password or ~/.netrc.
2024     def _real_initialize(self):
2025         if self._downloader is None:
2030         downloader_params = self._downloader.params
2032         # Attempt to use provided username and password or .netrc data
2033         if downloader_params.get('username', None) is not None:
2034             useremail = downloader_params['username']
2035             password = downloader_params['password']
2036         elif downloader_params.get('usenetrc', False):
2038                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2039                 if info is not None:
2043                     raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2044             except (IOError, netrc.NetrcParseError) as err:
2045                 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
# No credentials at all -> proceed anonymously (elided early return presumed).
2048         if useremail is None:
2057         request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
2060             login_results = compat_urllib_request.urlopen(request).read()
# If the login form is still present in the response, the login failed.
2061             if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
# NOTE(review): "exceded" is misspelled in this user-visible message; it is a
# runtime string, so left untouched here — fix would be a code change.
2062                 self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2064         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2065             self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
2068     def _real_extract(self, url):
2069         mobj = re.match(self._VALID_URL, url)
2071             self._downloader.report_error(u'invalid URL: %s' % url)
2073         video_id = mobj.group('ID')
# Canonicalize to the www video page before downloading.
2075         url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
2076         webpage = self._download_webpage(url, video_id)
# The player parameters are embedded as a JSON array between these two
# literal JavaScript fragments in the page source.
2078         BEFORE = '{swf.addParam(param[0], param[1]);});\n'
2079         AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
2080         m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
2082             raise ExtractorError(u'Cannot parse data')
2083         data = dict(json.loads(m.group(1)))
# 'params' is URL-encoded JSON containing the actual video descriptors.
2084         params_raw = compat_urllib_parse.unquote(data['params'])
2085         params = json.loads(params_raw)
2086         video_data = params['video_data'][0]
# Prefer HD, fall back to SD.
2087         video_url = video_data.get('hd_src')
2089             video_url = video_data['sd_src']
2091             raise ExtractorError(u'Cannot find video URL')
2092         video_duration = int(video_data['video_duration'])
2093         thumbnail = video_data['thumbnail_src']
2095         m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
2097             raise ExtractorError(u'Cannot find title in webpage')
2098         video_title = unescapeHTML(m.group(1))
2102             'title': video_title,
2105             'duration': video_duration,
2106             'thumbnail': thumbnail,
2111 class BlipTVIE(InfoExtractor):
2112     """Information extractor for blip.tv"""
# NOTE(review): the original line numbering has gaps — `try:` headers,
# `return`s and parts of the info dicts are elided from this view.
2114     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
# Used to pull the filename extension off the media URL.
2115     _URL_EXT = r'^.*\.([a-z0-9]+)$'
2116     IE_NAME = u'blip.tv'
2118     def report_direct_download(self, title):
2119         """Report information extraction."""
2120         self.to_screen(u'%s: Direct download detected' % title)
2122     def _real_extract(self, url):
2123         mobj = re.match(self._VALID_URL, url)
2125             self._downloader.report_error(u'invalid URL: %s' % url)
# /play/ URLs redirect to an embed URL whose fragment contains the real
# file reference; follow the redirect, rebuild the canonical URL and recurse.
2128         urlp = compat_urllib_parse_urlparse(url)
2129         if urlp.path.startswith('/play/'):
2130             request = compat_urllib_request.Request(url)
2131             response = compat_urllib_request.urlopen(request)
2132             redirecturl = response.geturl()
2133             rurlp = compat_urllib_parse_urlparse(redirecturl)
# The 'file' query value in the fragment ends with '/<id>'; keep the id.
2134             file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
2135             url = 'http://blip.tv/a/a-' + file_id
# Single level of recursion: the rebuilt URL no longer matches '/play/'.
2136             return self._real_extract(url)
# Ask blip.tv's JSON API for metadata (cchar is '?' or '&', set in elided code).
2143         json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2144         request = compat_urllib_request.Request(json_url)
# The iTunes user agent is required by the endpoint.
2145         request.add_header('User-Agent', 'iTunes/10.6.1')
2146         self.report_extraction(mobj.group(1))
2149             urlh = compat_urllib_request.urlopen(request)
2150             if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
# The URL points straight at the media file; derive id/title/ext from it.
2151                 basename = url.split('/')[-1]
2152                 title,ext = os.path.splitext(basename)
# NOTE(review): .decode('UTF-8') assumes a Python-2 byte string — confirm runtime.
2153                 title = title.decode('UTF-8')
2154                 ext = ext.replace('.', '')
2155                 self.report_direct_download(title)
2160                     'upload_date': None,
2165         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2166             raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
2167         if info is None: # Regular URL
2169                 json_code_bytes = urlh.read()
2170                 json_code = json_code_bytes.decode('utf-8')
2171             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2172                 self._downloader.report_error(u'unable to read video info webpage: %s' % compat_str(err))
2176                 json_data = json.loads(json_code)
# The API wraps the payload in 'Post' for single videos.
2177                 if 'Post' in json_data:
2178                     data = json_data['Post']
# API datestamp format: e.g. '08-15-12 02:46PM' -> YYYYMMDD.
2182                 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2183                 video_url = data['media']['url']
2184                 umobj = re.match(self._URL_EXT, video_url)
2186                     raise ValueError('Can not determine filename extension')
2187                 ext = umobj.group(1)
2190                     'id': data['item_id'],
2192                     'uploader': data['display_name'],
2193                     'upload_date': upload_date,
2194                     'title': data['title'],
2196                     'format': data['media']['mimeType'],
2197                     'thumbnail': data['thumbnailUrl'],
2198                     'description': data['description'],
2199                     'player_url': data['embedUrl'],
# Keep the same UA for the actual media download.
2200                     'user_agent': 'iTunes/10.6.1',
2202             except (ValueError,KeyError) as err:
2203                 self._downloader.report_error(u'unable to parse video information: %s' % repr(err))
2209 class MyVideoIE(InfoExtractor):
2210     """Information Extractor for myvideo.de."""
2212     _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2213     IE_NAME = u'myvideo'
2215     def _real_extract(self,url):
2216         mobj = re.match(self._VALID_URL, url)
# NOTE(review): `self._download` looks like a typo for `self._downloader`
# (every other extractor here uses self._downloader.report_error) — this
# branch would raise AttributeError if hit; confirm and fix.
2218             self._download.report_error(u'invalid URL: %s' % url)
2221         video_id = mobj.group(1)
2224         webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
2225         webpage = self._download_webpage(webpage_url, video_id)
2227         self.report_extraction(video_id)
# The thumbnail link encodes the media server base URL; the flv lives
# under the same path as the thumbnail directory.
2228         mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\'',
2231             self._downloader.report_error(u'unable to extract media URL')
2233         video_url = mobj.group(1) + ('/%s.flv' % video_id)
2235         mobj = re.search('<title>([^<]+)</title>', webpage)
2237             self._downloader.report_error(u'unable to extract title')
2240         video_title = mobj.group(1)
2246             'upload_date': None,
2247             'title': video_title,
2251 class ComedyCentralIE(InfoExtractor):
2252     """Information extractor for The Daily Show and Colbert Report """
2254     # urls can be abbreviations like :thedailyshow or :colbert
2255     # urls for episodes like:
2256     # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
2257     # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
2258     # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
# Verbose regex: requires the re.VERBOSE flag wherever it is matched
# (see the overridden suitable() below).
2259     _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
2260                      |(https?://)?(www\.)?
2261                          (?P<showname>thedailyshow|colbertnation)\.com/
2262                         (full-episodes/(?P<episode>.*)|
2264                           (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
2265                           |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
# Bitrates, lowest to highest; the last entry is picked by default below.
2268     _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
2270     _video_extensions = {
2278     _video_dimensions = {
2288     def suitable(cls, url):
2289         """Receives a URL and returns True if suitable for this IE."""
# Overridden (vs. the base class) to pass re.VERBOSE for the multi-line regex.
2290         return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
2292     def report_config_download(self, episode_id, media_id):
2293         self.to_screen(u'%s: Downloading configuration for %s' % (episode_id, media_id))
2295     def report_index_download(self, episode_id):
2296         self.to_screen(u'%s: Downloading show index' % episode_id)
2298     def _print_formats(self, formats):
2299         print('Available formats:')
2301             print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))
2304     def _real_extract(self, url):
# NOTE(review): original line numbering has gaps — several `if`/`try:`/
# `return` lines are elided from this view.
2305         mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2307             self._downloader.report_error(u'invalid URL: %s' % url)
# Shortname forms (':tds', ':colbert', ...) map to the show's
# full-episodes index page, then re-match the canonical URL.
2310         if mobj.group('shortname'):
2311             if mobj.group('shortname') in ('tds', 'thedailyshow'):
2312                 url = u'http://www.thedailyshow.com/full-episodes/'
2314                 url = u'http://www.colbertnation.com/full-episodes/'
2315             mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2316             assert mobj is not None
2318         if mobj.group('clip'):
2319             if mobj.group('showname') == 'thedailyshow':
2320                 epTitle = mobj.group('tdstitle')
2322                 epTitle = mobj.group('cntitle')
# No explicit episode -> we are on an index page; fetch the newest episode.
2325             dlNewest = not mobj.group('episode')
2327                 epTitle = mobj.group('showname')
2329                 epTitle = mobj.group('episode')
2331         req = compat_urllib_request.Request(url)
2332         self.report_extraction(epTitle)
2334             htmlHandle = compat_urllib_request.urlopen(req)
2335             html = htmlHandle.read()
2336             webpage = html.decode('utf-8')
2337         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2338             self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
# Index pages redirect to the newest episode; re-match against the final URL.
2341             url = htmlHandle.geturl()
2342             mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2344                 self._downloader.report_error(u'Invalid redirected URL: ' + url)
2346             if mobj.group('episode') == '':
2347                 self._downloader.report_error(u'Redirected URL is still not specific: ' + url)
2349             epTitle = mobj.group('episode')
# mtvnservices player URIs embedded in the page identify the media.
2351         mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)
2353         if len(mMovieParams) == 0:
2354             # The Colbert Report embeds the information in a without
2355             # a URL prefix; so extract the alternate reference
2356             # and then add the URL prefix manually.
2358             altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
2359             if len(altMovieParams) == 0:
2360                 self._downloader.report_error(u'unable to find Flash URL in webpage ' + url)
2363                 mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
2365         uri = mMovieParams[0][1]
# The MRSS feed lists the episode's parts (one <item> per part).
2366         indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
2367         self.report_index_download(epTitle)
2369             indexXml = compat_urllib_request.urlopen(indexUrl).read()
2370         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2371             self._downloader.report_error(u'unable to download episode index: ' + compat_str(err))
2376         idoc = xml.etree.ElementTree.fromstring(indexXml)
2377         itemEls = idoc.findall('.//item')
2378         for partNum,itemEl in enumerate(itemEls):
# guid format: '...:<show>.com:...:<shortMediaId>'.
2379             mediaId = itemEl.findall('./guid')[0].text
2380             shortMediaId = mediaId.split(':')[-1]
2381             showId = mediaId.split(':')[-2].replace('.com', '')
2382             officialTitle = itemEl.findall('./title')[0].text
2383             officialDate = unified_strdate(itemEl.findall('./pubDate')[0].text)
# Per-part config XML lists the available renditions (bitrate + rtmp src).
2385             configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2386                          compat_urllib_parse.urlencode({'uri': mediaId}))
2387             configReq = compat_urllib_request.Request(configUrl)
2388             self.report_config_download(epTitle, shortMediaId)
2390                 configXml = compat_urllib_request.urlopen(configReq).read()
2391             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2392                 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
2395             cdoc = xml.etree.ElementTree.fromstring(configXml)
2397             for rendition in cdoc.findall('.//rendition'):
2398                 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2402                 self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
2405             if self._downloader.params.get('listformats', None):
2406                 self._print_formats([i[0] for i in turls])
2409             # For now, just pick the highest bitrate
2410             format,rtmp_video_url = turls[-1]
2412             # Get the format arg from the arg stream
2413             req_format = self._downloader.params.get('format', None)
2415             # Select format if we can find one
2418                     format, rtmp_video_url = f, v
# rtmp URLs cannot be fetched directly; rewrite the media path onto the
# llnwd.net HTTP mirror instead.
2421             m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
2423                 raise ExtractorError(u'Cannot transform RTMP url')
2424             base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
2425             video_url = base + m.group('finalid')
# Parts are 1-indexed in the title.
2427             effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
2432                 'upload_date': officialDate,
2437                 'description': officialTitle,
2439             results.append(info)
2444 class EscapistIE(InfoExtractor):
2445     """Information extractor for The Escapist """
2447     _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2448     IE_NAME = u'escapist'
2450     def report_config_download(self, showName):
2451         self.to_screen(u'%s: Downloading configuration' % showName)
2453     def _real_extract(self, url):
# NOTE(review): original line numbering has gaps — `if mobj is None:`,
# `try:` headers and `return` lines are elided from this view.
2454         mobj = re.match(self._VALID_URL, url)
2456             self._downloader.report_error(u'invalid URL: %s' % url)
2458         showName = mobj.group('showname')
2459         videoId = mobj.group('episode')
2461         self.report_extraction(showName)
2463             webPage = compat_urllib_request.urlopen(url)
2464             webPageBytes = webPage.read()
# Sniff the charset from the Content-Type header; default to utf-8.
2465             m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
2466             webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
2467         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2468             self._downloader.report_error(u'unable to download webpage: ' + compat_str(err))
# Pull description, thumbnail and player URL from the og:/meta tags.
2471         descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2472         description = unescapeHTML(descMatch.group(1))
2473         imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2474         imgUrl = unescapeHTML(imgMatch.group(1))
2475         playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2476         playerUrl = unescapeHTML(playerUrlMatch.group(1))
# The player URL carries a 'config=' query parameter pointing at the
# (URL-encoded) playlist configuration.
2477         configUrlMatch = re.search('config=(.*)$', playerUrl)
2478         configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))
2480         self.report_config_download(showName)
2482             configJSON = compat_urllib_request.urlopen(configUrl)
2483             m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
2484             configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
2485         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2486             self._downloader.report_error(u'unable to download configuration: ' + compat_str(err))
2489         # Technically, it's JavaScript, not JSON
# HACK: single->double quote swap to make the JS object parseable as JSON;
# breaks if any value legitimately contains a single quote.
2490         configJSON = configJSON.replace("'", '"')
2493             config = json.loads(configJSON)
2494         except (ValueError,) as err:
2495             self._downloader.report_error(u'Invalid JSON in configuration file: ' + compat_str(err))
2498         playlist = config['playlist']
# Index 1 holds the actual video entry (index 0 is presumably an intro/ad
# — confirm against a live config).
2499         videoUrl = playlist[1]['url']
2504             'uploader': showName,
2505             'upload_date': None,
2508             'thumbnail': imgUrl,
2509             'description': description,
2510             'player_url': playerUrl,
2515 class CollegeHumorIE(InfoExtractor):
2516     """Information extractor for collegehumor.com"""
2519     _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2520     IE_NAME = u'collegehumor'
2522     def report_manifest(self, video_id):
2523         """Report information extraction."""
2524         self.to_screen(u'%s: Downloading XML manifest' % video_id)
2526     def _real_extract(self, url):
# NOTE(review): original line numbering has gaps — `if`/`try:`/`return`
# lines and the start of the info dict are elided from this view.
2527         mobj = re.match(self._VALID_URL, url)
2529             self._downloader.report_error(u'invalid URL: %s' % url)
2531         video_id = mobj.group('videoid')
2536             'upload_date': None,
2539         self.report_extraction(video_id)
# Step 1: metadata XML gives title/description/thumbnail and the f4m
# manifest URL.
2540         xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
2542             metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2543         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2544             self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
2547         mdoc = xml.etree.ElementTree.fromstring(metaXml)
2549             videoNode = mdoc.findall('./video')[0]
2550             info['description'] = videoNode.findall('./description')[0].text
2551             info['title'] = videoNode.findall('./caption')[0].text
2552             info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
2553             manifest_url = videoNode.findall('./file')[0].text
2555             self._downloader.report_error(u'Invalid metadata XML file')
# hdcore parameter is required by the Adobe HTTP Dynamic Streaming server.
2558         manifest_url += '?hdcore=2.10.3'
2559         self.report_manifest(video_id)
2561             manifestXml = compat_urllib_request.urlopen(manifest_url).read()
2562         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2563             self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
# Step 2: parse the f4m manifest (Adobe f4m 1.0 XML namespace).
2566         adoc = xml.etree.ElementTree.fromstring(manifestXml)
2568             media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
2569             node_id = media_node.attrib['url']
2570             video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
2571         except IndexError as err:
2572             self._downloader.report_error(u'Invalid manifest file')
# Rebuild the direct fragment URL from the manifest's id and media url;
# '[:-2]' strips the trailing two characters of the id (server convention
# — confirm against a live manifest).
2575         url_pr = compat_urllib_parse_urlparse(manifest_url)
2576         url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
2583 class XVideosIE(InfoExtractor):
2584     """Information extractor for xvideos.com"""
2586     _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2587     IE_NAME = u'xvideos'
2589     def _real_extract(self, url):
# NOTE(review): original line numbering has gaps — `if mobj is None:` /
# `return` lines are elided from this view.
2590         mobj = re.match(self._VALID_URL, url)
2592             self._downloader.report_error(u'invalid URL: %s' % url)
2594         video_id = mobj.group(1)
2596         webpage = self._download_webpage(url, video_id)
2598         self.report_extraction(video_id)
# The direct flv URL is passed URL-encoded in the player's query string.
2602         mobj = re.search(r'flv_url=(.+?)&', webpage)
2604             self._downloader.report_error(u'unable to extract video url')
2606         video_url = compat_urllib_parse.unquote(mobj.group(1))
# Title is the <title> tag up to the ' - XVID' suffix.
2610         mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2612             self._downloader.report_error(u'unable to extract video title')
2614         video_title = mobj.group(1)
2617         # Extract video thumbnail
# group(0): the whole matched URL is the thumbnail address.
2618         mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
2620             self._downloader.report_error(u'unable to extract video thumbnail')
2622         video_thumbnail = mobj.group(0)
2628             'upload_date': None,
2629             'title': video_title,
2631             'thumbnail': video_thumbnail,
2632             'description': None,
2638 class SoundcloudIE(InfoExtractor):
2639     """Information extractor for soundcloud.com
2640        To access the media, the uid of the song and a stream token
2641        must be extracted from the page source and the script must make
2642        a request to media.soundcloud.com/crossdomain.xml. Then
2643        the media can be grabbed by requesting from an url composed
2644        of the stream token and uid
# URL shape: soundcloud.com/<uploader>/<slug>.
2647     _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2648     IE_NAME = u'soundcloud'
2650     def report_resolve(self, video_id):
2651         """Report information extraction."""
2652         self.to_screen(u'%s: Resolving id' % video_id)
2654     def _real_extract(self, url):
# NOTE(review): original line numbering has gaps — `if`/`try:`/`return`
# lines are elided from this view.
2655         mobj = re.match(self._VALID_URL, url)
2657             self._downloader.report_error(u'invalid URL: %s' % url)
2660         # extract uploader (which is in the url)
2661         uploader = mobj.group(1)
2662         # extract simple title (uploader + slug of song title)
2663         slug_title =  mobj.group(2)
2664         simple_title = uploader + u'-' + slug_title
2666         self.report_resolve('%s/%s' % (uploader, slug_title))
# Step 1: resolve the page URL to track metadata via the public API.
# client_id is a hard-coded API key for this application.
2668         url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
2669         resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2670         request = compat_urllib_request.Request(resolv_url)
2672             info_json_bytes = compat_urllib_request.urlopen(request).read()
2673             info_json = info_json_bytes.decode('utf-8')
2674         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2675             self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
2678         info = json.loads(info_json)
2679         video_id = info['id']
2680         self.report_extraction('%s/%s' % (uploader, slug_title))
# Step 2: fetch the stream descriptors for that track id.
2682         streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2683         request = compat_urllib_request.Request(streams_url)
2685             stream_json_bytes = compat_urllib_request.urlopen(request).read()
2686             stream_json = stream_json_bytes.decode('utf-8')
2687         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2688             self._downloader.report_error(u'unable to download stream definitions: %s' % compat_str(err))
2691         streams = json.loads(stream_json)
# Always picks the 128kbit/s mp3 HTTP stream.
2692         mediaURL = streams['http_mp3_128_url']
2693         upload_date = unified_strdate(info['created_at'])
2698             'uploader': info['user']['username'],
2699             'upload_date': upload_date,
2700             'title': info['title'],
2702             'description': info['description'],
2705 class SoundcloudSetIE(InfoExtractor):
2706     """Information extractor for soundcloud.com sets
2707        To access the media, the uid of the song and a stream token
2708        must be extracted from the page source and the script must make
2709        a request to media.soundcloud.com/crossdomain.xml. Then
2710        the media can be grabbed by requesting from an url composed
2711        of the stream token and uid
# URL shape: soundcloud.com/<uploader>/sets/<slug>.
2714     _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
# NOTE(review): IE_NAME duplicates SoundcloudIE's u'soundcloud' — likely
# intended to be something like u'soundcloud:set'; confirm downstream usage.
2715     IE_NAME = u'soundcloud'
2717     def report_resolve(self, video_id):
2718         """Report information extraction."""
2719         self.to_screen(u'%s: Resolving id' % video_id)
2721     def _real_extract(self, url):
# NOTE(review): original line numbering has gaps — `if`/`try:`/`return`
# lines are elided from this view.
2722         mobj = re.match(self._VALID_URL, url)
2724             self._downloader.report_error(u'invalid URL: %s' % url)
2727         # extract uploader (which is in the url)
2728         uploader = mobj.group(1)
2729         # extract simple title (uploader + slug of song title)
2730         slug_title =  mobj.group(2)
2731         simple_title = uploader + u'-' + slug_title
2733         self.report_resolve('%s/sets/%s' % (uploader, slug_title))
# Step 1: resolve the set URL to its track list via the public API
# (same hard-coded client_id as SoundcloudIE).
2735         url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
2736         resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2737         request = compat_urllib_request.Request(resolv_url)
2739             info_json_bytes = compat_urllib_request.urlopen(request).read()
2740             info_json = info_json_bytes.decode('utf-8')
2741         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2742             self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
2746         info = json.loads(info_json)
# The resolve endpoint reports per-request errors inside the JSON body.
2747         if 'errors' in info:
2748             for err in info['errors']:
2749                 self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
# Step 2: fetch stream descriptors for each track in the set.
2752         for track in info['tracks']:
2753             video_id = track['id']
2754             self.report_extraction('%s/sets/%s' % (uploader, slug_title))
2756             streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2757             request = compat_urllib_request.Request(streams_url)
2759                 stream_json_bytes = compat_urllib_request.urlopen(request).read()
2760                 stream_json = stream_json_bytes.decode('utf-8')
2761             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2762                 self._downloader.report_error(u'unable to download stream definitions: %s' % compat_str(err))
2765             streams = json.loads(stream_json)
2766             mediaURL = streams['http_mp3_128_url']
# NOTE(review): unlike SoundcloudIE, 'created_at' is not passed through
# unified_strdate here — upload_date formats may differ between the two.
2771                 'uploader': track['user']['username'],
2772                 'upload_date': track['created_at'],
2773                 'title': track['title'],
2775                 'description': track['description'],
2780 class InfoQIE(InfoExtractor):
2781     """Information extractor for infoq.com"""
2782     _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2784     def _real_extract(self, url):
# NOTE(review): original line numbering has gaps — `if mobj is None:` /
# `return` lines are elided from this view.
2785         mobj = re.match(self._VALID_URL, url)
2787             self._downloader.report_error(u'invalid URL: %s' % url)
# No separate id is parsed from the URL; the URL itself doubles as the id.
2790         webpage = self._download_webpage(url, video_id=url)
2791         self.report_extraction(url)
# The media path is stored base64-encoded in the 'jsclassref' JS variable.
2794         mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
2796             self._downloader.report_error(u'unable to extract video url')
2798         real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
2799         video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id
2802         mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2804             self._downloader.report_error(u'unable to extract video title')
2806         video_title = mobj.group(1)
2808         # Extract description
2809         video_description = u'No description available.'
2810         mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2811         if mobj is not None:
2812             video_description = mobj.group(1)
# Derive the real id and extension from the decoded media filename.
2814         video_filename = video_url.split('/')[-1]
2815         video_id, extension = video_filename.split('.')
2821             'upload_date': None,
2822             'title': video_title,
2823             'ext': extension, # Extension is always(?) mp4, but seems to be flv
2825             'description': video_description,
2830 class MixcloudIE(InfoExtractor):
2831     """Information extractor for www.mixcloud.com"""
# Marked broken: tests are skipped and users are warned (see _WORKING docs
# in the class docstring at the top of the file).
2833     _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
2834     _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2835     IE_NAME = u'mixcloud'
2837     def report_download_json(self, file_id):
2838         """Report JSON download."""
2839         self.to_screen(u'Downloading json')
2841     def get_urls(self, jsonData, fmt, bitrate='best'):
2842         """Get urls from 'audio_formats' section in json"""
# jsonData[fmt] is either {bitrate: [urls]} or a plain [urls] list; the
# TypeError branch handles the latter (no bitrate info).
2845             bitrate_list = jsonData[fmt]
2846             if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2847                 bitrate = max(bitrate_list) # select highest
2849             url_list = jsonData[fmt][bitrate]
2850         except TypeError: # we have no bitrate info.
2851             url_list = jsonData[fmt]
2854     def check_urls(self, url_list):
2855         """Returns 1st active url from list"""
# Probes each URL with a GET; returns the first that opens without error.
2856         for url in url_list:
2858                 compat_urllib_request.urlopen(url)
2860             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2865     def _print_formats(self, formats):
2866         print('Available formats:')
2867         for fmt in formats.keys():
2868             for b in formats[fmt]:
2870                     ext = formats[fmt][b][0]
2871                     print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
2872                 except TypeError: # we have no bitrate info
2873                     ext = formats[fmt][0]
2874                     print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
2877     def _real_extract(self, url):
# NOTE(review): original line numbering has gaps — `if`/`try:`/`return`
# lines are elided from this view.
2878         mobj = re.match(self._VALID_URL, url)
2880             self._downloader.report_error(u'invalid URL: %s' % url)
2882         # extract uploader & filename from url
# NOTE(review): .decode('utf-8') on re match groups assumes Python-2 byte
# strings; under Python 3 these are str and have no .decode — likely why
# _WORKING is False. Confirm target runtime.
2883         uploader = mobj.group(1).decode('utf-8')
2884         file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2886         # construct API request
2887         file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2888         # retrieve .json file with links to files
2889         request = compat_urllib_request.Request(file_url)
2891             self.report_download_json(file_url)
2892             jsonData = compat_urllib_request.urlopen(request).read()
2893         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2894             self._downloader.report_error(u'Unable to retrieve file: %s' % compat_str(err))
2898         json_data = json.loads(jsonData)
2899         player_url = json_data['player_swf_url']
2900         formats = dict(json_data['audio_formats'])
2902         req_format = self._downloader.params.get('format', None)
2905         if self._downloader.params.get('listformats', None):
2906             self._print_formats(formats)
# 'best': take the first format that yields a live URL.
2909         if req_format is None or req_format == 'best':
2910             for format_param in formats.keys():
2911                 url_list = self.get_urls(formats, format_param)
2913                 file_url = self.check_urls(url_list)
2914                 if file_url is not None:
2917             if req_format not in formats:
2918                 self._downloader.report_error(u'format is not available')
2921             url_list = self.get_urls(formats, req_format)
2922             file_url = self.check_urls(url_list)
2923             format_param = req_format
2926             'id': file_id.decode('utf-8'),
2927             'url': file_url.decode('utf-8'),
2928             'uploader': uploader.decode('utf-8'),
2929             'upload_date': None,
2930             'title': json_data['name'],
2931             'ext': file_url.split('.')[-1].decode('utf-8'),
2932             'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2933             'thumbnail': json_data['thumbnail_url'],
2934             'description': json_data['description'],
2935             'player_url': player_url.decode('utf-8'),
2938 class StanfordOpenClassroomIE(InfoExtractor):
2939     """Information extractor for Stanford's Open ClassRoom"""
# Matches the site root, a course page, or a specific video page; the
# optional 'course' and 'video' groups select which branch runs below.
2941     _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
2942     IE_NAME = u'stanfordoc'
2944     def _real_extract(self, url):
# NOTE(review): original line numbering has gaps — `if mobj is None:`,
# `try:` headers, `return`s and parts of the info dicts are elided here.
2945         mobj = re.match(self._VALID_URL, url)
2947             raise ExtractorError(u'Invalid URL: %s' % url)
# Branch 1: specific video — download its metadata XML and return one entry.
2949         if mobj.group('course') and mobj.group('video'): # A specific video
2950             course = mobj.group('course')
2951             video = mobj.group('video')
2953                 'id': course + '_' + video,
2955                 'upload_date': None,
2958             self.report_extraction(info['id'])
2959             baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
2960             xmlUrl = baseUrl + video + '.xml'
2962                 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2963             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2964                 self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
2966             mdoc = xml.etree.ElementTree.fromstring(metaXml)
2968                 info['title'] = mdoc.findall('./title')[0].text
# Media URL is relative to the course's videos/ directory.
2969                 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
2971                 self._downloader.report_error(u'Invalid metadata XML file')
2973             info['ext'] = info['url'].rpartition('.')[2]
# Branch 2: course page — collect its VideoPage links and recursively
# extract each one via self.extract().
2975         elif mobj.group('course'): # A course page
2976             course = mobj.group('course')
2981                 'upload_date': None,
2984             coursepage = self._download_webpage(url, info['id'],
2985                                         note='Downloading course info page',
2986                                         errnote='Unable to download course info page')
2988             m = re.search('<h1>([^<]+)</h1>', coursepage)
2990                 info['title'] = unescapeHTML(m.group(1))
# Fall back to the course id when no <h1> title is present.
2992                 info['title'] = info['id']
2994             m = re.search('<description>([^<]+)</description>', coursepage)
2996                 info['description'] = unescapeHTML(m.group(1))
# orderedSet keeps first occurrence order while de-duplicating links.
2998             links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
3001                     'type': 'reference',
3002                     'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
3006             for entry in info['list']:
3007                 assert entry['type'] == 'reference'
3008                 results += self.extract(entry['url'])
# Branch 3: site root — collect all CoursePage links and recurse the same way.
3012                 'id': 'Stanford OpenClassroom',
3015                 'upload_date': None,
3018             self.report_download_webpage(info['id'])
3019             rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
3021                 rootpage = compat_urllib_request.urlopen(rootURL).read()
3022             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3023                 self._downloader.report_error(u'unable to download course info page: ' + compat_str(err))
3026             info['title'] = info['id']
3028             links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
3031                     'type': 'reference',
3032                     'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
3037             for entry in info['list']:
3038                 assert entry['type'] == 'reference'
3039                 results += self.extract(entry['url'])
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        # The scheme is optional in _VALID_URL; normalize before fetching.
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract song name')
            return
        # NOTE(review): .decode('iso-8859-1') on a decoded page only works on
        # Python 2 str; confirm against the running Python version.
        song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract performer')
            return
        performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        if mobj is None:
            # Fixed truncated error message ('unable to mtvn_uri').
            self._downloader.report_error(u'unable to extract mtvn_uri')
            return
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract content id')
            return
        content_id = mobj.group(1)

        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video metadata: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            self._downloader.report_error('Invalid rendition field.')
            return

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com (multi-segment flv/mp4 videos)."""

    _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def _gen_sid(self):
        """Build a pseudo-unique session id from the time and two randoms."""
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        """Derive the character permutation used to descramble file ids.

        Reproduces Youku's seeded pseudo-random shuffle of the alphabet.
        """
        mixed = []
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        seed = float(seed)
        for i in range(len(source)):
            # Linear-congruential step; mirrors the site's player code.
            seed = (seed * 211 + 30031 ) % 65536
            index = math.floor(seed / 65536 * len(source) )
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        #return ''.join(mixed)
        return mixed

    def _get_file_id(self, fileId, seed):
        """Map the scrambled '*'-separated fileId indices through the mix string."""
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        realId = []
        for ch in ids:
            if ch:
                realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        request = compat_urllib_request.Request(info_url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            jsondata = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)
        try:
            jsonstr = jsondata.decode('utf-8')
            config = json.loads(jsonstr)

            video_title = config['data'][0]['title']
            seed = config['data'][0]['seed']

            format = self._downloader.params.get('format', None)
            supported_format = list(config['data'][0]['streamfileids'].keys())

            # 'best' prefers HD when available; 'worst' takes mp4; default flv.
            if format is None or format == 'best':
                if 'hd2' in supported_format:
                    format = 'hd2'
                else:
                    format = 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'

            fileid = config['data'][0]['streamfileids'][format]
            keys = [s['k'] for s in config['data'][0]['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            self._downloader.report_error(u'unable to extract info section')
            return

        files_info = []
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        #column 8,9 of fileid represent the segment number
        #fileid[7:9] should be changed
        for index, key in enumerate(keys):

            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            info = {
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            }
            files_info.append(info)

        return files_info
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group(1)

        self.report_download_webpage(video_id)

        # Get webpage content
        try:
            webpage_bytes = compat_urllib_request.urlopen(url).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % err)
            return

        # The flv URL is percent-encoded inside the player parameters.
        result = re.search(self.VIDEO_URL_RE, webpage)
        if result is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(result.group(1))

        result = re.search(self.VIDEO_TITLE_RE, webpage)
        if result is None:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = result.group(1)

        result = re.search(self.VIDEO_THUMB_RE, webpage)
        if result is None:
            self._downloader.report_error(u'unable to extract video thumbnail')
            return
        video_thumbnail = result.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def report_extract_entry(self, url):
        """Report downloading entry"""
        self.to_screen(u'Downloading entry: %s' % url)

    def report_date(self, upload_date):
        """Report extracted entry date"""
        self.to_screen(u'Entry date: %s' % upload_date)

    def report_uploader(self, uploader):
        """Report extracted uploader"""
        self.to_screen(u'Uploader: %s' % uploader)

    def report_title(self, video_title):
        """Report extracted title"""
        self.to_screen(u'Title: %s' % video_title)

    def report_extract_vid_page(self, video_page):
        """Report information extraction."""
        self.to_screen(u'Extracting video page: %s' % video_page)

    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        post_url = mobj.group(0)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        self.report_extract_entry(post_url)
        request = compat_urllib_request.Request(post_url)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve entry webpage: %s' % compat_str(err))
            return

        # Extract update date
        upload_date = None
        pattern = 'title="Timestamp">(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            upload_date = mobj.group(1)
            # Convert timestring to a format suitable for filename
            upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
            upload_date = upload_date.strftime('%Y%m%d')
        self.report_date(upload_date)

        # Extract uploader
        uploader = None
        pattern = r'rel\="author".*?>(.*?)</a>'
        mobj = re.search(pattern, webpage)
        if mobj:
            uploader = mobj.group(1)
        self.report_uploader(uploader)

        # Extract title
        # Get the first line for title
        video_title = None
        pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
        mobj = re.search(pattern, webpage)
        if mobj:
            video_title = mobj.group(1)
        self.report_title(video_title)

        # Step 2, Stimulate clicking the image box to launch video
        pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
        mobj = re.search(pattern, webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video page URL')
            return

        video_page = mobj.group(1)
        request = compat_urllib_request.Request(video_page)
        try:
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return
        self.report_extract_vid_page(video_page)

        # Extract video links on video page
        """Extract video links of all sizes"""
        pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        mobj = re.findall(pattern, webpage)
        if len(mobj) == 0:
            self._downloader.report_error(u'unable to extract video links')
            return

        # Sort in resolution
        links = sorted(mobj)

        # Choose the lowest of the sort, i.e. highest resolution
        video_url = links[-1]
        # Only get the url. The resolution part in the tuple has no use anymore
        video_url = video_url[-1]
        # Treat escaped \u0026 style hex
        try:
            video_url = video_url.decode("unicode_escape")
        except AttributeError: # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': uploader,
            'upload_date': upload_date,
            'title': video_title,
            'ext': video_extension,
        }]
class NBAIE(InfoExtractor):
    """Information extractor for nba.com video pages."""

    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1)
        if video_id.endswith('/index.html'):
            video_id = video_id[:-len('/index.html')]

        webpage = self._download_webpage(url, video_id)

        # The CDN URL is derived directly from the page path.
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'

        def _findProp(rexp, default=None):
            """Return the first unescaped group match of rexp in the page, or default."""
            m = re.search(rexp, webpage)
            if m:
                return unescapeHTML(m.group(1))
            else:
                return default

        shortened_video_id = video_id.rpartition('/')[2]
        title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
        info = {
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            # Fixed key typo: was 'uploader_date'; the documented field is 'upload_date'.
            'upload_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
            'description': _findProp(r'<div class="description">(.*?)</h1>'),
        }
        return [info]
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self.to_screen(u'%s: Downloading video information from %d to %d' %
                (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url):
        try:
            urlh = compat_urllib_request.urlopen(url)
            webpage_bytes = urlh.read()
            webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info JSON: %s' % compat_str(err))
            return

        response = json.loads(webpage)
        # On error the API returns a dict with an 'error' key instead of a list.
        if type(response) != list:
            error_text = response.get('error', 'unknown error')
            self._downloader.report_error(u'Justin.tv API: %s' % error_text)
            return
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                # start_time is ISO-like; keep only YYYYMMDD digits.
                video_date = re.sub('-', '', clip['start_time'][:10])
                video_uploader_id = clip.get('user_id', clip.get('channel_id'))
                video_id = clip['id']
                video_title = clip.get('title', video_id)
                info.append({
                    'id': video_id,
                    'url': video_url,
                    'title': video_title,
                    'uploader': clip.get('channel_name', video_uploader_id),
                    'uploader_id': video_uploader_id,
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        api = 'http://api.justin.tv'
        video_id = mobj.group(mobj.lastindex)
        paged = False
        # Group 1 alone means a whole channel (paged archive); group 2 a single broadcast.
        if mobj.lastindex == 1:
            paged = True
            api += '/channel/archives/%s.json'
        else:
            api += '/broadcast/by_archive/%s.json'
        api = api % (video_id,)

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url)
            info.extend(page_info)
            # A short page means we reached the end of the archive.
            if not paged or page_count != limit:
                break
            offset += limit
        return info
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com video pages."""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
        if not m:
            self._downloader.report_error(u'unable to find video information')
            return
        video_url = unescapeHTML(m.group('url'))

        # Prefer the player heading; fall back to the <title> tag.
        m = re.search(r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", webpage, flags=re.DOTALL)
        if not m:
            m = re.search(r'<title>(?P<title>[^<]+?)</title>', webpage)
            if not m:
                self._downloader.report_error(u'Cannot find video title')
                return
        title = clean_html(m.group('title'))

        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        if m:
            desc = unescapeHTML(m.group('desc'))
        else:
            desc = None

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
        }
        return [info]
class SteamIE(InfoExtractor):
    """Information extractor for store.steampowered.com game trailer pages."""

    _VALID_URL = r"""http://store.steampowered.com/
                (agecheck/)?
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL needs the VERBOSE flag.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        gameID = m.group('gameID')
        # Go through the age gate with a fixed birth date.
        videourl = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970' % gameID
        self.report_age_confirmation()
        webpage = self._download_webpage(videourl, gameID)
        game_title = re.search(r'<h2 class="pageheader">(?P<game_title>.*?)</h2>', webpage).group('game_title')

        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        mweb = re.finditer(urlRE, webpage)
        namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
        titles = re.finditer(namesRE, webpage)
        thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
        thumbs = re.finditer(thumbsRE, webpage)
        videos = []
        # The three regexes iterate the page in the same order, so zip pairs them up.
        for vid,vtitle,thumb in zip(mweb,titles,thumbs):
            video_id = vid.group('videoID')
            title = vtitle.group('videoName')
            video_url = vid.group('videoURL')
            video_thumb = thumb.group('thumbnail')
            if not video_url:
                self._downloader.report_error(u'Cannot find video url for %s' % video_id)
            info = {
                'id': video_id,
                'url': video_url,
                'ext': 'flv',
                'title': unescapeHTML(title),
                'thumbnail': video_thumb
            }
            videos.append(info)
        return [self.playlist_result(videos, gameID, game_title)]
class UstreamIE(InfoExtractor):
    """Information extractor for recorded ustream.tv videos."""

    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')
        # The flv is addressable directly on the CDN by video id.
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)
        m = re.search(r'data-title="(?P<title>.+)"', webpage)
        title = m.group('title')
        m = re.search(r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"', webpage)
        uploader = m.group('uploader')
        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'flv',
            'title': title,
            'uploader': uploader
        }
        return [info]
class WorldStarHipHopIE(InfoExtractor):
    """Information extractor for worldstarhiphop.com / worldstarcandy.com."""

    _VALID_URL = r'http://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
    IE_NAME = u'WorldStarHipHop'

    def _real_extract(self, url):
        _src_url = r"""(http://hw-videos.*(?:mp4|flv))"""

        webpage_src = compat_urllib_request.urlopen(url).read()
        webpage_src = webpage_src.decode('utf-8')

        mobj = re.search(_src_url, webpage_src)

        m = re.match(self._VALID_URL, url)
        video_id = m.group('id')

        if mobj is not None:
            video_url = mobj.group()
            # Extension follows from the URL suffix matched above.
            if 'mp4' in video_url:
                ext = 'mp4'
            else:
                ext = 'flv'
        else:
            self._downloader.report_error(u'Cannot find video url for %s' % video_id)
            return

        _title = r"""<title>(.*)</title>"""

        mobj = re.search(_title, webpage_src)

        if mobj is not None:
            title = mobj.group(1)
        else:
            title = 'World Start Hip Hop - %s' % time.ctime()

        _thumbnail = r"""rel="image_src" href="(.*)" />"""
        mobj = re.search(_thumbnail, webpage_src)

        # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
        if mobj is not None:
            thumbnail = mobj.group(1)
        else:
            _title = r"""candytitles.*>(.*)</span>"""
            mobj = re.search(_title, webpage_src)
            if mobj is not None:
                title = mobj.group(1)
            thumbnail = None

        results = [{
            'id': video_id,
            'url': video_url,
            'title': title,
            'thumbnail': thumbnail,
            'ext': ext,
        }]
        return results
class RBMARadioIE(InfoExtractor):
    """Information extractor for rbmaradio.com show pages (JSON metadata embedded in the page)."""

    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')

        webpage = self._download_webpage(url, video_id)
        # Show metadata lives in an inline JS assignment.
        m = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
        if not m:
            raise ExtractorError(u'Cannot find metadata')
        json_data = m.group(1)

        try:
            data = json.loads(json_data)
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        video_url = data['akamai_url'] + '&cbr=256'
        url_parts = compat_urllib_parse_urlparse(video_url)
        video_ext = url_parts.path.rpartition('.')[2]
        info = {
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': data['title'],
            'description': data.get('teaser_text'),
            'location': data.get('country_of_origin'),
            'uploader': data.get('host', {}).get('name'),
            'uploader_id': data.get('host', {}).get('slug'),
            'thumbnail': data.get('image', {}).get('large_url_2x'),
            'duration': data.get('duration'),
        }
        return [info]
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s' % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        """Return the entry whose 'format' equals req_format, or None."""
        for x in formats:
            if(x["format"]==req_format):
                return x
        return None

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')

        # The site gates content behind an age check cookie.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get the video title
        result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract video title')
        video_title = result.group('title').strip()

        # Get the video date
        result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract video date')
            upload_date = None
        else:
            upload_date = unified_strdate(result.group('date').strip())

        # Get the video uploader
        result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
        if result is None:
            self._downloader.report_warning(u'unable to extract uploader')
            video_uploader = None
        else:
            video_uploader = result.group('uploader').strip()
            video_uploader = clean_html( video_uploader )

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        result = re.search(DOWNLOAD_LIST_RE, webpage)
        if result is None:
            raise ExtractorError(u'Unable to extract download list')
        download_list_html = result.group('download_list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if(len(links) == 0):
            raise ExtractorError(u'ERROR: no known formats available for video')

        self.to_screen(u'Links found: %d' % len(links))

        formats = []
        for link in links:

            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML( link )
            path = compat_urllib_parse_urlparse( video_url ).path
            extension = os.path.splitext( path )[1][1:]
            # Path segment 4 encodes "<size>_<bitrate>_<id>".
            format = path.split('/')[4].split('_')[:2]
            size = format[0]
            bitrate = format[1]
            format = "-".join( format )
            title = u'%s-%s-%s' % (video_title, size, bitrate)

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': title,
                'ext': extension,
                'format': format,
                'thumbnail': None,
                'description': None,
                'player_url': None,
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self.to_screen(u'Format: %s' % req_format)

        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            format = self._specific( req_format, formats )
            # Fixed wrong-variable check: this must test the lookup result,
            # not the stale 'result' match object from the regex above.
            if format is None:
                self._downloader.report_error(u'requested format not available')
                return
            return [format]
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')
        video_title = mobj.group('title')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video URL
        VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
        result = re.search(VIDEO_URL_RE, webpage)
        if result is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(result.group('url'))

        #Get the uploaded date
        VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
        result = re.search(VIDEO_UPLOADED_RE, webpage)
        if result is None:
            # Fixed copy-pasted error message (previously said 'video title').
            self._downloader.report_error(u'unable to extract video upload date')
            return
        upload_date = unified_strdate(result.group('date'))

        info = {'id': video_id,
                'url': video_url,
                'uploader': None,
                'upload_date': upload_date,
                'title': video_title,
                'ext': 'flv',
                'format': 'flv'}

        return [info]
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video title
        result = re.search(r'<title>(?P<title>.*)</title>', webpage)
        if result is None:
            raise ExtractorError(u'ERROR: unable to extract video title')
        video_title = result.group('title').strip()

        # Get the embed page
        result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        if result is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')

        embed_page_url = result.group(0).strip()
        # The numeric embed id supersedes the slug from the original URL.
        video_id = result.group('videoid')

        webpage = self._download_webpage(embed_page_url, video_id)

        # Get the video URL from the player setup call
        result = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
        if result is None:
            raise ExtractorError(u'ERROR: unable to extract video url')
        video_url = result.group('source')

        info = {'id': video_id,
                'url': video_url,
                'title': video_title,
                'ext': 'flv',
                'format': 'flv',
                'player_url': embed_page_url}

        return [info]
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes (one entry per track)."""

    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        m = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
        if not m:
            raise ExtractorError(u'Cannot find trax information')
        json_like = m.group(1)
        data = json.loads(json_like)

        # The play API wants a random per-session id.
        session = str(random.randint(0, 1000000000))
        mix_id = data['id']
        track_count = data['tracks_count']
        first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
        next_url = first_url
        res = []
        # Walk the mix one track at a time until the API flags the last one.
        for i in itertools.count():
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(i+1), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            info = {
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
                'ext': 'm4a',
            }
            res.append(info)
            if api_data['set']['at_last_track']:
                break
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
        return res
class KeekIE(InfoExtractor):
    """Information extractor for keek.com short videos."""

    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')
        # Video and thumbnail URLs are derived directly from the id on the CDN.
        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
        webpage = self._download_webpage(url, video_id)
        m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
        title = unescapeHTML(m.group('title'))
        m = re.search(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>', webpage)
        uploader = clean_html(m.group('uploader'))
        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumbnail,
            'uploader': uploader
        }
        return [info]
class TEDIE(InfoExtractor):
    """Information extractor for ted.com talks and playlists."""

    _VALID_URL=r'''http://www.ted.com/
                   (
                        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                        |
                        ((?P<type_talk>talks)) # We have a simple talk
                   )
                   (/lang/(.*?))? # The url may contain the language
                   /(?P<name>\w+) # Here goes the name and then ".html"
                   '''

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL needs the VERBOSE flag.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        m=re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type_talk'):
            return [self._talk_info(url)]
        else:
            playlist_id=m.group('playlist_id')
            name=m.group('name')
            self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
            return [self._playlist_videos_info(url,name,playlist_id)]

    def _talk_video_link(self,mediaSlug):
        '''Returns the video link for that mediaSlug'''
        return 'http://download.ted.com/talks/%s.mp4' % mediaSlug

    def _playlist_videos_info(self,url,name,playlist_id=0):
        '''Returns the videos of the playlist'''
        video_RE=r'''
                     <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
                     ([.\s]*?)data-playlist_item_id="(\d+)"
                     ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
                     '''
        video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
        m_names=re.finditer(video_name_RE,webpage)

        playlist_RE = r'div class="headline">(\s*?)<h1>(\s*?)<span>(?P<playlist_title>.*?)</span>'
        m_playlist = re.search(playlist_RE, webpage)
        playlist_title = m_playlist.group('playlist_title')

        playlist_entries = []
        # The two finditer streams run over the page in the same order.
        for m_video, m_name in zip(m_videos,m_names):
            video_id=m_video.group('video_id')
            talk_url='http://www.ted.com%s' % m_name.group('talk_url')
            playlist_entries.append(self.url_result(talk_url, 'TED'))
        return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        m=re.match(self._VALID_URL, url,re.VERBOSE)
        videoName=m.group('name')
        webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
        # If the url includes the language we get the title translated
        title_RE=r'<span id="altHeadline" >(?P<title>.*)</span>'
        title=re.search(title_RE, webpage).group('title')
        info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
                        "id":(?P<videoID>[\d]+).*?
                        "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
        thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
        thumb_match=re.search(thumb_RE,webpage)
        info_match=re.search(info_RE,webpage,re.VERBOSE)
        video_id=info_match.group('videoID')
        mediaSlug=info_match.group('mediaSlug')
        video_url=self._talk_video_link(mediaSlug)
        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumb_match.group('thumbnail')
        }
        return info
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de (metadata served as XML)."""

    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            self._downloader.report_error(u'unable to extract download url')
            return
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            self._downloader.report_error(u'unable to extract title')
            return
        title = title_el.text
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            # NOTE(review): fallback value reconstructed from elided source — confirm.
            format = 'mp4'
        else:
            format = format_id_el.text
        description_el = metadata.find('description')
        if description_el is not None:
            description = description_el.text
        else:
            description = None
        imagePreview_el = metadata.find('imagePreview')
        if imagePreview_el is not None:
            thumbnail = imagePreview_el.text
        else:
            thumbnail = None
        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
class SpiegelIE(InfoExtractor):
    """Extractor for spiegel.de videos.

    Reads the title from the article page, then fetches a per-video XML
    document whose last entry names the flash video file and duration.
    """
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')

        webpage = self._download_webpage(url, video_id)
        m = re.search(r'<div class="spVideoTitle">(.*?)</div>', webpage)
        if not m:
            raise ExtractorError(u'Cannot find title')
        video_title = unescapeHTML(m.group(1))

        # the XML companion document lists the available renditions
        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(xml_url, video_id,
                                          note=u'Downloading XML', errnote=u'Failed to download XML')

        idoc = xml.etree.ElementTree.fromstring(xml_code)
        # the last <type> entry is used; presumably the highest quality
        last_type = idoc[-1]
        filename = last_type.findall('./filename')[0].text
        duration = float(last_type.findall('./duration')[0].text)

        video_url = 'http://video2.spiegel.de/flash/' + filename
        video_ext = filename.rpartition('.')[2]
        info = {
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': video_title,
            'duration': duration,
        }
        return [info]
class LiveLeakIE(InfoExtractor):
    """Extractor for liveleak.com view pages.

    The direct file URL is scraped from the embedded player config; the
    title/description come from OpenGraph meta tags and the uploader
    from the "By:" byline, both optional except the title.
    """

    _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        m = re.search(r'file: "(.*?)",', webpage)
        if not m:
            self._downloader.report_error(u'unable to find video url')
            return
        video_url = m.group(1)

        m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
        if not m:
            self._downloader.report_error(u'Cannot find video title')
            return
        # strip the site-name prefix LiveLeak puts in og:title
        title = unescapeHTML(m.group('title')).replace('LiveLeak.com -', '').strip()

        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        desc = unescapeHTML(m.group('desc')) if m else None

        m = re.search(r'By:.*?(\w+)</a>', webpage)
        uploader = clean_html(m.group(1)) if m else None

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
            'uploader': uploader
        }

        return [info]
class ARDIE(InfoExtractor):
    """Extractor for the ARD Mediathek.

    Collects the ``mediaCollection.addMediaStream(...)`` calls from the
    page, then picks media type 0 at the highest quality.  The chosen
    stream is either an RTMP stream (rtmp_url set) or a plain HTTP
    download.
    """
    _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
    _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
    _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'

    def _real_extract(self, url):
        # determine video id from url; prefer the numeric documentId
        # query parameter, fall back to the last path element
        m = re.match(self._VALID_URL, url)

        numid = re.search(r'documentId=([0-9]+)', url)
        if numid:
            video_id = numid.group(1)
        else:
            video_id = m.group('video_id')

        # determine title and media streams from webpage
        html = self._download_webpage(url, video_id)
        title = re.search(self._TITLE, html).group('title')
        streams = [mo.groupdict() for mo in re.finditer(self._MEDIA_STREAM, html)]
        if not streams:
            # no streams at all should only happen for age-restricted
            # ("fsk") videos that are unavailable before 8 pm
            assert '"fsk"' in html
            self._downloader.report_error(u'this video is only available after 8:00 pm')
            return

        # choose default media type and highest quality for now
        stream = max([s for s in streams if int(s["media_type"]) == 0],
                     key=lambda s: int(s["quality"]))

        # there's two possibilities: RTMP stream or HTTP download
        info = {'id': video_id, 'title': title, 'ext': 'mp4'}
        if stream['rtmp_url']:
            self.to_screen(u'RTMP download detected')
            assert stream['video_url'].startswith('mp4:')
            info["url"] = stream["rtmp_url"]
            info["play_path"] = stream['video_url']
        else:
            assert stream["video_url"].endswith('.mp4')
            info["url"] = stream["video_url"]
        return [info]
class TumblrIE(InfoExtractor):
    """Extractor for videos embedded in Tumblr posts.

    The canonical post URL is rebuilt from the blog name and post id,
    and the video URL / extension / thumbnail / title are scraped from
    the x22-escaped markup embedded in the page's JavaScript.
    """
    # NOTE(review): dots escaped around 'tumblr.com' — unescaped '.'
    # matched any character.
    _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'

    def _real_extract(self, url):
        m_url = re.match(self._VALID_URL, url)
        video_id = m_url.group('id')
        blog = m_url.group('blog_name')

        url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
        webpage = self._download_webpage(url, video_id)

        re_video = r'src=\\x22(?P<video_url>http://%s.tumblr.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
        video = re.search(re_video, webpage)
        if video is None:
            # typo fixed: "founded" -> "found"
            self.to_screen("No video found")
            return
        video_url = video.group('video_url')
        ext = video.group('ext')

        re_thumb = r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22'  # We pick the first poster
        thumb = re.search(re_thumb, webpage).group('thumb').replace('\\', '')

        # The only place where you can get a title, it's not complete,
        # but searching in other places doesn't work for all videos
        re_title = r'<title>(.*?) - (?P<title>.*?)</title>'
        title = unescapeHTML(re.search(re_title, webpage).group('title'))

        return [{'id': video_id,
                 'url': video_url,
                 'title': title,
                 'thumbnail': thumb,
                 'ext': ext
                 }]
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    """
    # NOTE(review): this excerpt is heavily elided — the ordered list of
    # extractor instances (and the surrounding return statement) is mostly
    # missing from this view; only a few representative entries survive
    # below.  The order is load-bearing (first match wins), so restore the
    # full list from the complete file before editing here.
        YoutubePlaylistIE(),
        StanfordOpenClassroomIE(),
        WorldStarHipHopIE(),
def get_info_extractor(ie_name):
    """Return the info extractor class named ``<ie_name>IE``.

    The class is looked up in this module's global namespace, so a
    KeyError propagates when no extractor with that name is defined.
    """
    class_name = ie_name + 'IE'
    return globals()[class_name]