2 # -*- coding: utf-8 -*-
4 from __future__ import absolute_import
15 import xml.etree.ElementTree
# Abstract base class shared by every site-specific extractor in this file;
# subclasses override _real_initialize()/_real_extract() (see docstring below).
23 class InfoExtractor(object):
24 """Information Extractor class.
26 Information extractors are the classes that, given a URL, extract
27 information about the video (or videos) the URL refers to. This
28 information includes the real video URL, the video title, author and
29 others. The information is stored in a dictionary which is then
30 passed to the FileDownloader. The FileDownloader processes this
31 information possibly downloading the video to the file system, among
32 other possible outcomes.
34 The dictionaries must include the following fields:
38 title: Video title, unescaped.
39 ext: Video filename extension.
41 The following fields are optional:
43 format: The video format, defaults to ext (used for --get-format)
44 thumbnail: Full URL to a video thumbnail image.
45 description: One-line video description.
46 uploader: Full name of the video uploader.
47 upload_date: Video upload date (YYYYMMDD).
48 uploader_id: Nickname or id of the video uploader.
49 location: Physical location of the video.
50 player_url: SWF Player URL (used for rtmpdump).
51 subtitles: The subtitle file contents.
52 urlhandle: [internal] The urlHandle to be used to download the file,
53 like returned by urllib.request.urlopen
55 The fields should all be Unicode strings.
57 Subclasses of this one should re-define the _real_initialize() and
58 _real_extract() methods and define a _VALID_URL regexp.
59 Probably, they should also be added to the list of extractors.
61 _real_extract() must return a *list* of information dictionaries as
64 Finally, the _WORKING attribute should be set to False for broken IEs
65 in order to warn the users and skip the tests.
# Construction only wires up the downloader; the real (possibly network-touching)
# initialization is deferred until initialize() is called.
72 def __init__(self, downloader=None):
73 """Constructor. Receives an optional downloader."""
75 self.set_downloader(downloader)
# NOTE(review): presumably a classmethod in the full source (its decorator line is
# not shown in this listing) -- it receives cls, not self.
78 def suitable(cls, url):
79 """Receives a URL and returns True if suitable for this IE."""
80 return re.match(cls._VALID_URL, url) is not None
# NOTE(review): docstring of the _WORKING accessor; its def/return lines are not
# shown in this listing.
84 """Getter method for _WORKING."""
# NOTE(review): docstring of initialize(); it delegates to _real_initialize() below
# (the def line and any run-once guard are not shown here).
88 """Initializes an instance (authentication, etc)."""
90 self._real_initialize()
# Public entry point: defers to the subclass's _real_extract().
93 def extract(self, url):
94 """Extracts URL information and returns it in list of dicts."""
96 return self._real_extract(url)
def set_downloader(self, downloader):
    """Attach the FileDownloader instance that this extractor reports to."""
    self._downloader = downloader
# Template-method hooks: subclasses override these two no-op defaults.
102 def _real_initialize(self):
103 """Real initialization process. Redefine in subclasses."""
106 def _real_extract(self, url):
107 """Real extraction process. Redefine in subclasses."""
# NOTE(review): body of the IE_NAME property -- strips the trailing "IE" from the
# class name (the @property/def lines are not shown in this listing).
112 return type(self).__name__[:-2]
# Opens the URL (or prepared Request) and returns the raw response handle,
# emitting a progress note first; `note=False` suppresses the note entirely.
114 def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
115 """ Returns the response handle """
117 self.report_download_webpage(video_id)
118 elif note is not False:
119 self.to_screen(u'%s: %s' % (video_id, note))
# NOTE(review): the try: line guarding this urlopen call is not shown here.
121 return compat_urllib_request.urlopen(url_or_request)
122 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
124 errnote = u'Unable to download webpage'
# Network errors are re-raised as ExtractorError, keeping the original traceback.
125 raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])
# Downloads a page and decodes it to text using the charset advertised in the
# Content-Type header (the fallback branch for a missing charset is elided here).
127 def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
128 """ Returns the data of the page as a string """
129 urlh = self._request_webpage(url_or_request, video_id, note, errnote)
130 content_type = urlh.headers.get('Content-Type', '')
131 m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
133 encoding = m.group(1)
136 webpage_bytes = urlh.read()
# --dump-intermediate-pages support: emit the page as base64 so binary content
# survives the console.
137 if self._downloader.params.get('dump_intermediate_pages', False):
139 url = url_or_request.get_full_url()
140 except AttributeError:
142 self.to_screen(u'Dumping request to ' + url)
143 dump = base64.b64encode(webpage_bytes).decode('ascii')
144 self._downloader.to_screen(dump)
# 'replace' keeps extraction alive even when the page lies about its charset.
145 return webpage_bytes.decode(encoding, 'replace')
def to_screen(self, msg):
    """Write *msg* to the console, tagged with this extractor's IE_NAME."""
    tagged = u'[%s] %s' % (self.IE_NAME, msg)
    self._downloader.to_screen(tagged)

def report_extraction(self, id_or_name):
    """Announce that information extraction has started."""
    self.to_screen(u'%s: Extracting information' % id_or_name)

def report_download_webpage(self, video_id):
    """Announce that the video webpage is being downloaded."""
    self.to_screen(u'%s: Downloading webpage' % video_id)

def report_age_confirmation(self):
    """Announce that the age-gate is being confirmed."""
    self.to_screen(u'Confirming age')
163 #Methods for following #608
164 #They set the correct value of the '_type' key
# Tags an info dict as a plain video result.
# NOTE(review): the return statement is not shown in this listing.
165 def video_result(self, video_info):
166 """Returns a video"""
167 video_info['_type'] = 'video'
# Builds a result that tells the caller to re-dispatch extraction to `url`.
169 def url_result(self, url, ie=None):
170 """Returns a url that points to a page that should be processed"""
171 #TODO: ie should be the class used for getting the info
# NOTE(review): the remaining dict entries and the return are not shown here.
172 video_info = {'_type': 'url',
# Builds a playlist result wrapping `entries`; id/title are attached only when
# provided (the guarding if-lines and the return are not shown in this listing).
176 def playlist_result(self, entries, playlist_id=None, playlist_title=None):
177 """Returns a playlist"""
178 video_info = {'_type': 'playlist',
181 video_info['id'] = playlist_id
183 video_info['title'] = playlist_title
# Extractor for single YouTube videos; handles login, age-gate and format choice.
187 class YoutubeIE(InfoExtractor):
188 """Information extractor for youtube.com."""
192 (?:https?://)? # http(s):// (optional)
193 (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
194 tube\.majestyc\.net/) # the various hostnames, with wildcard subdomains
195 (?:.*?\#/)? # handle anchor (#/) redirect urls
196 (?: # the various things that can precede the ID:
197 (?:(?:v|embed|e)/) # v/ or embed/ or e/
198 |(?: # or the v= param in all its forms
199 (?:watch(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
200 (?:\?|\#!?) # the params delimiter ? or # or #!
201 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
204 )? # optional -> youtube.com/xxxx is OK
205 )? # all until now is optional -> you can pass the naked ID
206 ([0-9A-Za-z_-]+) # here is it! the YouTube video ID
207 (?(1).+)? # if we found the ID, everything can follow
# Endpoints used by _real_initialize(): language opt-out, Google login, age gate.
209 _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
210 _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
211 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
212 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
213 _NETRC_MACHINE = 'youtube'
214 # Listed in order of quality
215 _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
216 _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
# itag -> container extension map (most entries are not shown in this listing).
217 _video_extensions = {
223 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
# itag -> "WxH" resolution label map (entries not shown in this listing).
229 _video_dimensions = {
# NOTE(review): a classmethod in the full source (decorator line not shown).
# Defers to the playlist extractor first so playlist URLs are not claimed here.
248 def suitable(cls, url):
249 """Receives a URL and returns True if suitable for this IE."""
250 if YoutubePlaylistIE.suitable(url): return False
# re.VERBOSE because _VALID_URL is written with embedded comments/whitespace.
251 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
def report_lang(self):
    """Announce an attempt to set the interface language."""
    self.to_screen(u'Setting language')

def report_login(self):
    """Announce an attempt to log in."""
    self.to_screen(u'Logging in')

def report_video_webpage_download(self, video_id):
    """Announce the download of the video's watch page."""
    self.to_screen(u'%s: Downloading video webpage' % video_id)

def report_video_info_webpage_download(self, video_id):
    """Announce the download of the get_video_info page."""
    self.to_screen(u'%s: Downloading video info webpage' % video_id)

def report_video_subtitles_download(self, video_id):
    """Announce that the list of available subtitles is being checked."""
    self.to_screen(u'%s: Checking available subtitles' % video_id)

def report_video_subtitles_request(self, video_id, sub_lang, format):
    """Announce the download of a single subtitle track."""
    self.to_screen(u'%s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))

def report_video_subtitles_available(self, video_id, sub_lang_list):
    """List the subtitle languages available for the video."""
    langs = ",".join(sub_lang_list.keys())
    self.to_screen(u'%s: Available subtitles for video: %s' % (video_id, langs))

def report_information_extraction(self, video_id):
    """Announce the start of metadata extraction."""
    self.to_screen(u'%s: Extracting video information' % video_id)

def report_unavailable_format(self, video_id, format):
    """Announce that a requested format is unavailable."""
    self.to_screen(u'%s: Format %s not available' % (video_id, format))

def report_rtmp_download(self):
    """Announce that the download will use the RTMP protocol."""
    self.to_screen(u'RTMP download detected')
# Fetches the subtitle track list for a video. On failure returns an
# (error_message, None) tuple; the success-path return of the {lang: name}
# dict is not shown in this listing.
294 def _get_available_subtitles(self, video_id):
295 self.report_video_subtitles_download(video_id)
296 request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
# NOTE(review): the try: line guarding this urlopen call is not shown here.
298 sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
299 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
300 return (u'unable to download video subtitles: %s' % compat_str(err), None)
# Scrape (name, lang_code) pairs from the XML and invert to {lang_code: name}.
301 sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
302 sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
303 if not sub_lang_list:
304 return (u'video doesn\'t have subtitles', None)
307 def _list_available_subtitles(self, video_id):
308 sub_lang_list = self._get_available_subtitles(video_id)
309 self.report_video_subtitles_available(video_id, sub_lang_list)
# Downloads one subtitle track. Returns (None, lang, data) on success, or
# (error_message, None, None) on failure -- see lines 327/329/330 below.
311 def _request_subtitle(self, sub_lang, sub_name, video_id, format):
314 (error_message, sub_lang, sub)
316 self.report_video_subtitles_request(video_id, sub_lang, format)
317 params = compat_urllib_parse.urlencode({
323 url = 'http://www.youtube.com/api/timedtext?' + params
325 sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
326 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
327 return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
329 return (u'Did not fetch video subtitles', None, None)
330 return (None, sub_lang, sub)
# Picks a single subtitle language -- the user's --sub-lang choice first, then
# apparently 'en', then the first available -- and fetches that one track.
332 def _extract_subtitle(self, video_id):
334 Return a list with a tuple:
335 [(error_message, sub_lang, sub)]
337 sub_lang_list = self._get_available_subtitles(video_id)
338 sub_format = self._downloader.params.get('subtitlesformat')
339 if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
340 return [(sub_lang_list[0], None, None)]
341 if self._downloader.params.get('subtitleslang', False):
342 sub_lang = self._downloader.params.get('subtitleslang')
343 elif 'en' in sub_lang_list:
346 sub_lang = list(sub_lang_list.keys())[0]
347 if not sub_lang in sub_lang_list:
348 return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]
350 subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
# Fetches every available subtitle track; the accumulator initialisation and the
# final return are not shown in this listing.
353 def _extract_all_subtitles(self, video_id):
354 sub_lang_list = self._get_available_subtitles(video_id)
355 sub_format = self._downloader.params.get('subtitlesformat')
356 if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
357 return [(sub_lang_list[0], None, None)]
359 for sub_lang in sub_lang_list:
360 subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
361 subtitles.append(subtitle)
# Prints one line per itag with its extension and resolution label.
# NOTE(review): the for-loop header binding `x` over `formats` is not shown here.
364 def _print_formats(self, formats):
365 print('Available formats:')
367 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
# Pre-extraction setup: set the interface language, optionally log in with
# username/password or .netrc credentials, then confirm the age gate.
369 def _real_initialize(self):
370 if self._downloader is None:
375 downloader_params = self._downloader.params
377 # Attempt to use provided username and password or .netrc data
378 if downloader_params.get('username', None) is not None:
379 username = downloader_params['username']
380 password = downloader_params['password']
381 elif downloader_params.get('usenetrc', False):
383 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
388 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
# .netrc problems only warn -- extraction can proceed unauthenticated.
389 except (IOError, netrc.NetrcParseError) as err:
390 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
# Set interface language to English so later regex scraping sees known markup.
394 request = compat_urllib_request.Request(self._LANG_URL)
397 compat_urllib_request.urlopen(request).read()
398 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
399 self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
402 # No authentication to be performed
# Fetch the login page to harvest the hidden GALX/dsh form tokens.
406 request = compat_urllib_request.Request(self._LOGIN_URL)
408 login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
409 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
410 self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
415 match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
417 galx = match.group(1)
419 match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
425 u'continue': u'http://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
429 u'PersistentCookie': u'yes',
431 u'bgresponse': u'js_disabled',
432 u'checkConnection': u'',
433 u'checkedDomains': u'youtube',
439 u'signIn': u'Sign in',
441 u'service': u'youtube',
445 # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
447 login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
448 login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
449 request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
452 login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
# If the login form is still present in the response, authentication failed.
453 if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
454 self._downloader.report_warning(u'unable to log in: bad username or password')
456 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
457 self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
# Confirm the age gate by POSTing the confirmation form.
463 'action_confirm': 'Confirm',
465 request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
467 self.report_age_confirmation()
468 age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
469 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
470 self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
# Pulls the video ID (capture group 2 of _VALID_URL) out of a URL.
# NOTE(review): the invalid-URL guard and return lines are not shown here.
473 def _extract_id(self, url):
474 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
476 self._downloader.report_error(u'invalid URL: %s' % url)
478 video_id = mobj.group(2)
# Main extraction pipeline: resolve redirects, download the watch page and
# get_video_info, scrape metadata, pick formats, and build the info dict(s).
481 def _real_extract(self, url):
482 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
483 mobj = re.search(self._NEXT_URL_RE, url)
485 url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
486 video_id = self._extract_id(url)
# Download the watch page (has_verified=1 skips the age interstitial).
489 self.report_video_webpage_download(video_id)
490 url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
491 request = compat_urllib_request.Request(url)
493 video_webpage_bytes = compat_urllib_request.urlopen(request).read()
494 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
495 self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
498 video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')
500 # Attempt to extract SWF player URL
501 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
# Unescape the JSON backslash-escaping in the scraped SWF URL.
503 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
# Try several &el= variants of get_video_info until one returns a 'token'.
508 self.report_video_info_webpage_download(video_id)
509 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
510 video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
511 % (video_id, el_type))
512 video_info_webpage = self._download_webpage(video_info_url, video_id,
514 errnote='unable to download video info webpage')
515 video_info = compat_parse_qs(video_info_webpage)
516 if 'token' in video_info:
518 if 'token' not in video_info:
519 if 'reason' in video_info:
520 self._downloader.report_error(u'YouTube said: %s' % video_info['reason'][0])
522 self._downloader.report_error(u'"token" parameter not in video info for unknown reason')
525 # Check for "rental" videos
526 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
527 self._downloader.report_error(u'"rental" videos not supported')
530 # Start extracting information
531 self.report_information_extraction(video_id)
# uploader (required)
534 if 'author' not in video_info:
535 self._downloader.report_error(u'unable to extract uploader name')
537 video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
# uploader id (optional -- scraped from the watch page markup)
540 video_uploader_id = None
541 mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
543 video_uploader_id = mobj.group(1)
545 self._downloader.report_warning(u'unable to extract uploader nickname')
548 if 'title' not in video_info:
549 self._downloader.report_error(u'unable to extract video title')
551 video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
554 if 'thumbnail_url' not in video_info:
555 self._downloader.report_warning(u'unable to extract video thumbnail')
557 else: # don't panic if we can't find it
558 video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
# upload date: normalise separators, then try several date formats in turn.
562 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
564 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
565 format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
566 for expression in format_expressions:
568 upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
# description: prefer the page element, fall back to the meta tag, else empty.
573 video_description = get_element_by_id("eow-description", video_webpage)
574 if video_description:
575 video_description = clean_html(video_description)
577 fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
579 video_description = unescapeHTML(fd_mobj.group(1))
581 video_description = u''
# subtitles (controlled by --write-sub / --all-subs / --list-subs)
584 video_subtitles = None
586 if self._downloader.params.get('writesubtitles', False):
587 video_subtitles = self._extract_subtitle(video_id)
589 (sub_error, sub_lang, sub) = video_subtitles[0]
591 self._downloader.report_error(sub_error)
593 if self._downloader.params.get('allsubtitles', False):
594 video_subtitles = self._extract_all_subtitles(video_id)
595 for video_subtitle in video_subtitles:
596 (sub_error, sub_lang, sub) = video_subtitle
598 self._downloader.report_error(sub_error)
600 if self._downloader.params.get('listsubtitles', False):
601 sub_lang_list = self._list_available_subtitles(video_id)
604 if 'length_seconds' not in video_info:
605 self._downloader.report_warning(u'unable to extract video duration')
608 video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
611 video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])
613 # Decide which formats to download
614 req_format = self._downloader.params.get('format', None)
# RTMP streams bypass the itag format map entirely.
616 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
617 self.report_rtmp_download()
618 video_url_list = [(None, video_info['conn'][0])]
619 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
620 url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
621 url_data = [compat_parse_qs(uds) for uds in url_data_strs]
622 url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
623 url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)
# --format-limit caps the quality list; --prefer-free-formats reorders it.
625 format_limit = self._downloader.params.get('format_limit', None)
626 available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
627 if format_limit is not None and format_limit in available_formats:
628 format_list = available_formats[available_formats.index(format_limit):]
630 format_list = available_formats
631 existing_formats = [x for x in format_list if x in url_map]
632 if len(existing_formats) == 0:
633 raise ExtractorError(u'no known formats available for video')
634 if self._downloader.params.get('listformats', None):
635 self._print_formats(existing_formats)
637 if req_format is None or req_format == 'best':
638 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
639 elif req_format == 'worst':
640 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
641 elif req_format in ('-1', 'all'):
642 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
644 # Specific formats. We pick the first in a slash-delimeted sequence.
645 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
646 req_formats = req_format.split('/')
647 video_url_list = None
648 for rf in req_formats:
650 video_url_list = [(rf, url_map[rf])]
652 if video_url_list is None:
653 raise ExtractorError(u'requested format not available')
655 raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')
# Build one info dict per selected format (result-list accumulation lines
# are not shown in this listing).
658 for format_param, video_real_url in video_url_list:
660 video_extension = self._video_extensions.get(format_param, 'flv')
662 video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
663 self._video_dimensions.get(format_param, '???'))
667 'url': video_real_url,
668 'uploader': video_uploader,
669 'uploader_id': video_uploader_id,
670 'upload_date': upload_date,
671 'title': video_title,
672 'ext': video_extension,
673 'format': video_format,
674 'thumbnail': video_thumbnail,
675 'description': video_description,
676 'player_url': player_url,
677 'subtitles': video_subtitles,
678 'duration': video_duration
# Extractor for metacafe.com; group(1) of _VALID_URL is the video id.
683 class MetacafeIE(InfoExtractor):
684 """Information Extractor for metacafe.com."""
686 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
# Family-filter disclaimer page fetched once during _real_initialize().
687 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
688 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
689 IE_NAME = u'metacafe'
def report_disclaimer(self):
    """Announce that the disclaimer page is being retrieved."""
    self.to_screen(u'Retrieving disclaimer')
# Fetches the family-filter disclaimer, then POSTs the over-18 confirmation so
# later page downloads are not blocked by the filter.
695 def _real_initialize(self):
696 # Retrieve disclaimer
697 request = compat_urllib_request.Request(self._DISCLAIMER)
699 self.report_disclaimer()
700 disclaimer = compat_urllib_request.urlopen(request).read()
701 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
702 self._downloader.report_error(u'unable to retrieve disclaimer: %s' % compat_str(err))
708 'submit': "Continue - I'm over 18",
710 request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
712 self.report_age_confirmation()
713 disclaimer = compat_urllib_request.urlopen(request).read()
714 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
715 self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
# Extraction: direct mediaURL (+ optional gdaKey) first, flashvars JSON second.
# NOTE(review): the .decode('utf-8') calls near the end assume Python 2 byte
# strings; on Python 3 str they would raise AttributeError.
718 def _real_extract(self, url):
719 # Extract id and simplified title from URL
720 mobj = re.match(self._VALID_URL, url)
722 self._downloader.report_error(u'invalid URL: %s' % url)
725 video_id = mobj.group(1)
727 # Check if video comes from YouTube
728 mobj2 = re.match(r'^yt-(.*)$', video_id)
729 if mobj2 is not None:
# "yt-XXXX" ids are YouTube mirrors: hand off to the YouTube extractor.
730 return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1), 'Youtube')]
732 # Retrieve video webpage to extract further information
733 webpage = self._download_webpage('http://www.metacafe.com/watch/%s/' % video_id, video_id)
735 # Extract URL, uploader and title from webpage
736 self.report_extraction(video_id)
737 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
739 mediaURL = compat_urllib_parse.unquote(mobj.group(1))
740 video_extension = mediaURL[-3:]
742 # Extract gdaKey if available
743 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
747 gdaKey = mobj.group(1)
748 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
# Fallback path: parse the flashvars blob for mediaURL/key.
750 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
752 self._downloader.report_error(u'unable to extract media URL')
754 vardict = compat_parse_qs(mobj.group(1))
755 if 'mediaData' not in vardict:
756 self._downloader.report_error(u'unable to extract media URL')
758 mobj = re.search(r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
760 self._downloader.report_error(u'unable to extract media URL')
762 mediaURL = mobj.group('mediaURL').replace('\\/', '/')
763 video_extension = mediaURL[-3:]
764 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key'))
766 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
768 self._downloader.report_error(u'unable to extract title')
770 video_title = mobj.group(1).decode('utf-8')
772 mobj = re.search(r'submitter=(.*?);', webpage)
774 self._downloader.report_error(u'unable to extract uploader nickname')
776 video_uploader = mobj.group(1)
779 'id': video_id.decode('utf-8'),
780 'url': video_url.decode('utf-8'),
781 'uploader': video_uploader.decode('utf-8'),
783 'title': video_title,
784 'ext': video_extension.decode('utf-8'),
# Extractor for dailymotion.com/.fr/etc. video pages.
788 class DailymotionIE(InfoExtractor):
789 """Information Extractor for Dailymotion"""
791 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
792 IE_NAME = u'dailymotion'
795 def _real_extract(self, url):
796 # Extract id and simplified title from URL
797 mobj = re.match(self._VALID_URL, url)
799 self._downloader.report_error(u'invalid URL: %s' % url)
# Strip the "_slug-title" suffix and any query string from the captured id.
802 video_id = mobj.group(1).split('_')[0].split('?')[0]
804 video_extension = 'mp4'
806 # Retrieve video webpage to extract further information
807 request = compat_urllib_request.Request(url)
# Disable the family filter so age-restricted pages still render.
808 request.add_header('Cookie', 'family_filter=off')
809 webpage = self._download_webpage(request, video_id)
811 # Extract URL, uploader and title from webpage
812 self.report_extraction(video_id)
813 mobj = re.search(r'\s*var flashvars = (.*)', webpage)
815 self._downloader.report_error(u'unable to extract media URL')
817 flashvars = compat_urllib_parse.unquote(mobj.group(1))
# Quality preference: first key found in flashvars wins (the selection/break
# lines are not shown in this listing).
819 for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
822 self.to_screen(u'Using %s' % key)
825 self._downloader.report_error(u'unable to extract video URL')
828 mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
830 self._downloader.report_error(u'unable to extract video URL')
# Undo the JSON backslash-escaping of slashes in the scraped URL.
833 video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')
835 # TODO: support choosing qualities
837 mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
839 self._downloader.report_error(u'unable to extract title')
841 video_title = unescapeHTML(mobj.group('title'))
843 video_uploader = None
844 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
846 # looking for an official user as fallback
847 mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
848 if mobj_official is None:
849 self._downloader.report_warning(u'unable to extract uploader nickname')
851 video_uploader = mobj_official.group(1)
853 video_uploader = mobj.group(1)
# Upload date is scraped as DD-MM-YYYY and reassembled into YYYYMMDD.
855 video_upload_date = None
856 mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
858 video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)
863 'uploader': video_uploader,
864 'upload_date': video_upload_date,
865 'title': video_title,
866 'ext': video_extension,
# Extractor for photobucket.com flv links; group(1) of _VALID_URL is the
# "current=" file name used as the video id.
870 class PhotobucketIE(InfoExtractor):
871 """Information extractor for photobucket.com."""
873 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
874 IE_NAME = u'photobucket'
# NOTE(review): the .decode('utf-8') calls near the end assume Python 2 byte
# strings; on Python 3 str they would raise AttributeError.
876 def _real_extract(self, url):
877 # Extract id from URL
878 mobj = re.match(self._VALID_URL, url)
880 self._downloader.report_error(u'Invalid URL: %s' % url)
883 video_id = mobj.group(1)
885 video_extension = 'flv'
887 # Retrieve video webpage to extract further information
888 request = compat_urllib_request.Request(url)
890 self.report_download_webpage(video_id)
891 webpage = compat_urllib_request.urlopen(request).read()
892 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
893 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
896 # Extract URL, uploader, and title from webpage
897 self.report_extraction(video_id)
898 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
900 self._downloader.report_error(u'unable to extract media URL')
902 mediaURL = compat_urllib_parse.unquote(mobj.group(1))
# Title and uploader come from the same <title> tag capture groups.
906 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
908 self._downloader.report_error(u'unable to extract title')
910 video_title = mobj.group(1).decode('utf-8')
912 video_uploader = mobj.group(2).decode('utf-8')
915 'id': video_id.decode('utf-8'),
916 'url': video_url.decode('utf-8'),
917 'uploader': video_uploader,
919 'title': video_title,
920 'ext': video_extension.decode('utf-8'),
924 class YahooIE(InfoExtractor):
925 """Information extractor for video.yahoo.com."""
928 # _VALID_URL matches all Yahoo! Video URLs
929 # _VPAGE_URL matches only the extractable '/watch/' URLs
930 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
931 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
932 IE_NAME = u'video.yahoo'
934 def _real_extract(self, url, new_video=True):
935 # Extract ID from URL
936 mobj = re.match(self._VALID_URL, url)
938 self._downloader.report_error(u'Invalid URL: %s' % url)
941 video_id = mobj.group(2)
942 video_extension = 'flv'
944 # Rewrite valid but non-extractable URLs as
945 # extractable English language /watch/ URLs
946 if re.match(self._VPAGE_URL, url) is None:
947 request = compat_urllib_request.Request(url)
949 webpage = compat_urllib_request.urlopen(request).read()
950 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
951 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
954 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
956 self._downloader.report_error(u'Unable to extract id field')
958 yahoo_id = mobj.group(1)
960 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
962 self._downloader.report_error(u'Unable to extract vid field')
964 yahoo_vid = mobj.group(1)
966 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
967 return self._real_extract(url, new_video=False)
969 # Retrieve video webpage to extract further information
970 request = compat_urllib_request.Request(url)
972 self.report_download_webpage(video_id)
973 webpage = compat_urllib_request.urlopen(request).read()
974 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
975 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
978 # Extract uploader and title from webpage
979 self.report_extraction(video_id)
980 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
982 self._downloader.report_error(u'unable to extract video title')
984 video_title = mobj.group(1).decode('utf-8')
986 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
988 self._downloader.report_error(u'unable to extract video uploader')
990 video_uploader = mobj.group(1).decode('utf-8')
992 # Extract video thumbnail
993 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
995 self._downloader.report_error(u'unable to extract video thumbnail')
997 video_thumbnail = mobj.group(1).decode('utf-8')
999 # Extract video description
1000 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1002 self._downloader.report_error(u'unable to extract video description')
1004 video_description = mobj.group(1).decode('utf-8')
1005 if not video_description:
1006 video_description = 'No description available.'
1008 # Extract video height and width
1009 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1011 self._downloader.report_error(u'unable to extract video height')
1013 yv_video_height = mobj.group(1)
1015 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1017 self._downloader.report_error(u'unable to extract video width')
1019 yv_video_width = mobj.group(1)
1021 # Retrieve video playlist to extract media URL
1022 # I'm not completely sure what all these options are, but we
1023 # seem to need most of them, otherwise the server sends a 401.
1024 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
1025 yv_bitrate = '700' # according to Wikipedia this is hard-coded
1026 request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1027 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1028 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1030 self.report_download_webpage(video_id)
1031 webpage = compat_urllib_request.urlopen(request).read()
1032 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1033 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
1036 # Extract media URL from playlist XML
1037 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1039 self._downloader.report_error(u'Unable to extract media URL')
1041 video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1042 video_url = unescapeHTML(video_url)
1045 'id': video_id.decode('utf-8'),
1047 'uploader': video_uploader,
1048 'upload_date': None,
1049 'title': video_title,
1050 'ext': video_extension.decode('utf-8'),
1051 'thumbnail': video_thumbnail.decode('utf-8'),
1052 'description': video_description,
# NOTE(review): this excerpt's embedded ordinals have gaps (e.g. 1065 -> 1067),
# so `try:` headers, `if mobj is None:` guards and `return` lines appear elided.
# Confirm structure against the full file before changing any logic here.
1056 class VimeoIE(InfoExtractor):
1057 """Information extractor for vimeo.com."""
1059 # _VALID_URL matches Vimeo URLs
1060 _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
# Extracts metadata and a direct media URL for a single Vimeo video page.
1063 def _real_extract(self, url, new_video=True):
1064 # Extract ID from URL
1065 mobj = re.match(self._VALID_URL, url)
1067 self._downloader.report_error(u'Invalid URL: %s' % url)
1070 video_id = mobj.group('id')
# Normalize the URL: force https and rewrite HLS direct links to the page URL.
1071 if not mobj.group('proto'):
1072 url = 'https://' + url
1073 if mobj.group('direct_link'):
1074 url = 'https://vimeo.com/' + video_id
1076 # Retrieve video webpage to extract further information
1077 request = compat_urllib_request.Request(url, None, std_headers)
1079 self.report_download_webpage(video_id)
1080 webpage_bytes = compat_urllib_request.urlopen(request).read()
1081 webpage = webpage_bytes.decode('utf-8')
1082 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1083 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
1086 # Now we begin extracting as much information as we can from what we
1087 # retrieved. First we extract the information common to all extractors,
1088 # and latter we extract those that are Vimeo specific.
1089 self.report_extraction(video_id)
1091 # Extract the config JSON
# Slice the embedded player config out of the page by its surrounding markers;
# brittle string splitting — raises IndexError if the page layout changes.
1093 config = webpage.split(' = {config:')[1].split(',assets:')[0]
1094 config = json.loads(config)
1096 self._downloader.report_error(u'unable to extract info section')
1100 video_title = config["video"]["title"]
1102 # Extract uploader and uploader_id
1103 video_uploader = config["video"]["owner"]["name"]
# uploader_id is the last path component of the owner's profile URL.
1104 video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]
1106 # Extract video thumbnail
1107 video_thumbnail = config["video"]["thumbnail"]
1109 # Extract video description
1110 video_description = get_element_by_attribute("itemprop", "description", webpage)
1111 if video_description: video_description = clean_html(video_description)
1112 else: video_description = u''
1114 # Extract upload date
1115 video_upload_date = None
# dateCreated is ISO-8601; collapse YYYY-MM-DD into the YYYYMMDD form
# expected by the upload_date field.
1116 mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
1117 if mobj is not None:
1118 video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)
1120 # Vimeo specific: extract request signature and timestamp
# sig/timestamp are required query parameters for the play_redirect endpoint.
1121 sig = config['request']['signature']
1122 timestamp = config['request']['timestamp']
1124 # Vimeo specific: extract video codec and quality information
1125 # First consider quality, then codecs, then take everything
1126 # TODO bind to format param
# Codec preference order; each entry maps codec name -> container extension.
1127 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1128 files = { 'hd': [], 'sd': [], 'other': []}
1129 for codec_name, codec_extension in codecs:
1130 if codec_name in config["video"]["files"]:
1131 if 'hd' in config["video"]["files"][codec_name]:
1132 files['hd'].append((codec_name, codec_extension, 'hd'))
1133 elif 'sd' in config["video"]["files"][codec_name]:
1134 files['sd'].append((codec_name, codec_extension, 'sd'))
# Neither hd nor sd: fall back to whatever quality label the config lists first.
1136 files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))
# Pick the best available (quality first, then codec order within quality).
1138 for quality in ('hd', 'sd', 'other'):
1139 if len(files[quality]) > 0:
1140 video_quality = files[quality][0][2]
1141 video_codec = files[quality][0][0]
1142 video_extension = files[quality][0][1]
1143 self.to_screen(u'%s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
1146 self._downloader.report_error(u'no known codec found')
# Build the final redirect URL carrying id, signature, timestamp and format.
1149 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1150 %(video_id, sig, timestamp, video_quality, video_codec.upper())
# Result dictionary (the enclosing return/list literal is elided in this excerpt).
1155 'uploader': video_uploader,
1156 'uploader_id': video_uploader_id,
1157 'upload_date': video_upload_date,
1158 'title': video_title,
1159 'ext': video_extension,
1160 'thumbnail': video_thumbnail,
1161 'description': video_description,
# NOTE(review): embedded ordinals have gaps here too — try/guard/return lines
# are elided from this excerpt; verify against the full file before editing.
1165 class ArteTvIE(InfoExtractor):
1166 """arte.tv information extractor."""
1168 _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
# Live-stream pages are identified by an index-<n>.html suffix on the video id.
1169 _LIVE_URL = r'index-[0-9]+\.html$'
1171 IE_NAME = u'arte.tv'
# Download a URL and return the raw page body; reports (rather than raises)
# on network errors or invalid URLs.
1173 def fetch_webpage(self, url):
1174 request = compat_urllib_request.Request(url)
1176 self.report_download_webpage(url)
1177 webpage = compat_urllib_request.urlopen(request).read()
1178 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1179 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
1181 except ValueError as err:
1182 self._downloader.report_error(u'Invalid URL: %s' % url)
# Fetch `url`, run `regex` over it, and collect the requested capture groups
# into a dict. matchTuples is a list of (group_index, dict_key, error_message);
# a missing group triggers report_error with that message.
1186 def grep_webpage(self, url, regex, regexFlags, matchTuples):
1187 page = self.fetch_webpage(url)
1188 mobj = re.search(regex, page, regexFlags)
1192 self._downloader.report_error(u'Invalid URL: %s' % url)
1195 for (i, key, err) in matchTuples:
1196 if mobj.group(i) is None:
1197 self._downloader.report_error(err)
1200 info[key] = mobj.group(i)
# Resolve a live-stream page: locate the videothek JS, then the RTMP-style
# path/player/url triple for the viewer's language.
1204 def extractLiveStream(self, url):
# Language code is a fixed path component of the live URL (fr/de).
1205 video_lang = url.split('/')[-4]
1206 info = self.grep_webpage(
1208 r'src="(.*?/videothek_js.*?\.js)',
1211 (1, 'url', u'Invalid URL: %s' % url)
1214 http_host = url.split('/')[2]
1215 next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
1216 info = self.grep_webpage(
# Geo-restricted stream key is language-specific (geoFRDE + lang suffix).
1218 r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
1219 '(http://.*?\.swf).*?' +
1223 (1, 'path', u'could not extract video path: %s' % url),
1224 (2, 'player', u'could not extract video player: %s' % url),
1225 (3, 'url', u'could not extract video url: %s' % url)
1228 video_url = u'%s/%s' % (info.get('url'), info.get('path'))
# Resolve an Arte+7 (catch-up) page: follow two levels of indirection
# (videoref file, then the per-language <video> ref) to the final XML
# carrying id/title/date and the HD url.
1230 def extractPlus7Stream(self, url):
1231 video_lang = url.split('/')[-3]
1232 info = self.grep_webpage(
1234 r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
1237 (1, 'url', u'Invalid URL: %s' % url)
1240 next_url = compat_urllib_parse.unquote(info.get('url'))
1241 info = self.grep_webpage(
1243 r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
1246 (1, 'url', u'Could not find <video> tag: %s' % url)
1249 next_url = compat_urllib_parse.unquote(info.get('url'))
1251 info = self.grep_webpage(
1253 r'<video id="(.*?)".*?>.*?' +
1254 '<name>(.*?)</name>.*?' +
1255 '<dateVideo>(.*?)</dateVideo>.*?' +
1256 '<url quality="hd">(.*?)</url>',
1259 (1, 'id', u'could not extract video id: %s' % url),
1260 (2, 'title', u'could not extract video title: %s' % url),
1261 (3, 'date', u'could not extract video date: %s' % url),
1262 (4, 'url', u'could not extract video url: %s' % url)
# Result dictionary (surrounding return/list literal elided in this excerpt).
1267 'id': info.get('id'),
1268 'url': compat_urllib_parse.unquote(info.get('url')),
1269 'uploader': u'arte.tv',
1270 'upload_date': info.get('date'),
1271 'title': info.get('title').decode('utf-8'),
# Dispatch on URL shape: live stream vs. Plus7 catch-up page.
1277 def _real_extract(self, url):
1278 video_id = url.split('/')[-1]
1279 self.report_extraction(video_id)
1281 if re.search(self._LIVE_URL, video_id) is not None:
1282 self.extractLiveStream(url)
1286 info = self.extractPlus7Stream(url)
# NOTE(review): embedded ordinals have gaps — try/guard/return lines are
# elided from this excerpt; verify against the full file before editing.
1290 class GenericIE(InfoExtractor):
1291 """Generic last-resort information extractor."""
1294 IE_NAME = u'generic'
# Warn the user once that no specific extractor matched, then defer to the
# base-class download report (suppressed in test mode).
1296 def report_download_webpage(self, video_id):
1297 """Report webpage download."""
1298 if not self._downloader.params.get('test', False):
1299 self._downloader.report_warning(u'Falling back on generic information extractor.')
1300 super(GenericIE, self).report_download_webpage(video_id)
1302 def report_following_redirect(self, new_url):
1303 """Report information extraction."""
1304 self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
# Resolve URL-shortener style redirects by issuing a HEAD request chain and
# returning the final URL (behavior around "no redirect" is elided here).
1306 def _test_redirect(self, url):
1307 """Check if it is a redirect, like url shorteners, in case return the new url."""
# Request subclass that forces the HEAD method so no body is downloaded.
1308 class HeadRequest(compat_urllib_request.Request):
1309 def get_method(self):
1312 class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
1314 Subclass the HTTPRedirectHandler to make it use our
1315 HeadRequest also on the redirected URL
1317 def redirect_request(self, req, fp, code, msg, headers, newurl):
1318 if code in (301, 302, 303, 307):
# Some servers emit unescaped spaces in Location headers.
1319 newurl = newurl.replace(' ', '%20')
# Drop body-describing headers; a HEAD request carries no body.
1320 newheaders = dict((k,v) for k,v in req.headers.items()
1321 if k.lower() not in ("content-length", "content-type"))
1322 return HeadRequest(newurl,
1324 origin_req_host=req.get_origin_req_host(),
1327 raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)
1329 class HTTPMethodFallback(compat_urllib_request.BaseHandler):
1331 Fallback to GET if HEAD is not allowed (405 HTTP error)
1333 def http_error_405(self, req, fp, code, msg, headers):
1337 newheaders = dict((k,v) for k,v in req.headers.items()
1338 if k.lower() not in ("content-length", "content-type"))
# Re-issue the same URL as a normal (GET) request through the parent opener.
1339 return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
1341 origin_req_host=req.get_origin_req_host(),
# Build a bare opener with only the handlers needed for the HEAD probe.
1345 opener = compat_urllib_request.OpenerDirector()
1346 for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
1347 HTTPMethodFallback, HEADRedirectHandler,
1348 compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
1349 opener.add_handler(handler())
1351 response = opener.open(HeadRequest(url))
1352 new_url = response.geturl()
1357 self.report_following_redirect(new_url)
# Last-resort extraction: follow redirects, then scrape the page for a
# JW-Player-style file= / source= media URL and a <title>.
1360 def _real_extract(self, url):
1361 new_url = self._test_redirect(url)
1362 if new_url: return [self.url_result(new_url)]
1364 video_id = url.split('/')[-1]
1366 webpage = self._download_webpage(url, video_id)
1367 except ValueError as err:
1368 # since this is the last-resort InfoExtractor, if
1369 # this error is thrown, it'll be thrown here
1370 self._downloader.report_error(u'Invalid URL: %s' % url)
1373 self.report_extraction(video_id)
1374 # Start with something easy: JW Player in SWFObject
1375 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1377 # Broaden the search a little bit
1378 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1380 # Broaden the search a little bit: JWPlayer JS loader
1381 mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
1383 self._downloader.report_error(u'Invalid URL: %s' % url)
1386 # It's possible that one of the regexes
1387 # matched, but returned an empty group:
1388 if mobj.group(1) is None:
1389 self._downloader.report_error(u'Invalid URL: %s' % url)
1392 video_url = compat_urllib_parse.unquote(mobj.group(1))
1393 video_id = os.path.basename(video_url)
1395 # here's a fun little line of code for you:
# Derive extension and id from the media URL's basename.
1396 video_extension = os.path.splitext(video_id)[1][1:]
1397 video_id = os.path.splitext(video_id)[0]
1399 # it's tempting to parse this further, but you would
1400 # have to take into account all the variations like
1401 # Video Title - Site Name
1402 # Site Name | Video Title
1403 # Video Title - Tagline | Site Name
1404 # and so on and so forth; it's just not practical
1405 mobj = re.search(r'<title>(.*)</title>', webpage)
1407 self._downloader.report_error(u'unable to extract title')
1409 video_title = mobj.group(1)
1411 # video uploader is domain name
1412 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1414 self._downloader.report_error(u'unable to extract title')
1416 video_uploader = mobj.group(1)
# Result dictionary (surrounding return/list literal elided in this excerpt).
1421 'uploader': video_uploader,
1422 'upload_date': None,
1423 'title': video_title,
1424 'ext': video_extension,
# NOTE(review): embedded ordinals have gaps — try/guard/return lines are
# elided from this excerpt; verify against the full file before editing.
1428 class YoutubeSearchIE(InfoExtractor):
1429 """Information Extractor for YouTube search queries."""
# Accepts ytsearch:, ytsearchN:, ytsearchall: style pseudo-URLs.
1430 _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
1431 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1432 _max_youtube_results = 1000
1433 IE_NAME = u'youtube:search'
1435 def report_download_page(self, query, pagenum):
1436 """Report attempt to download search page with given number."""
1437 query = query.decode(preferredencoding())
1438 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
# Parse the prefix (none / 'all' / integer N) and fetch that many results.
1440 def _real_extract(self, query):
1441 mobj = re.match(self._VALID_URL, query)
1443 self._downloader.report_error(u'invalid search query "%s"' % query)
1446 prefix, query = query.split(':')
1448 query = query.encode('utf-8')
1450 return self._get_n_results(query, 1)
1451 elif prefix == 'all':
1452 self._get_n_results(query, self._max_youtube_results)
1457 self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
1459 elif n > self._max_youtube_results:
# API caps results; clamp and warn rather than fail.
1460 self._downloader.report_warning(u'ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1461 n = self._max_youtube_results
1462 return self._get_n_results(query, n)
1463 except ValueError: # parsing prefix as integer fails
1464 return self._get_n_results(query, 1)
# Page through the GData API (50 results per page) collecting video ids
# until `n` results or the reported total is reached.
1466 def _get_n_results(self, query, n):
1467 """Get a specified number of results for a query"""
1473 while (50 * pagenum) < limit:
1474 self.report_download_page(query, pagenum+1)
# start-index is 1-based in the GData API.
1475 result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
1476 request = compat_urllib_request.Request(result_url)
1478 data = compat_urllib_request.urlopen(request).read().decode('utf-8')
1479 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1480 self._downloader.report_error(u'unable to download API page: %s' % compat_str(err))
1482 api_response = json.loads(data)['data']
1484 if not 'items' in api_response:
1485 self._downloader.report_error(u'[youtube] No video results')
1488 new_ids = list(video['id'] for video in api_response['items'])
1489 video_ids += new_ids
# Never request past what the API says exists for this query.
1491 limit = min(n, api_response['totalItems'])
1494 if len(video_ids) > n:
1495 video_ids = video_ids[:n]
1496 videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids]
# NOTE(review): embedded ordinals have gaps — try/guard/return lines are
# elided from this excerpt; verify against the full file before editing.
1500 class GoogleSearchIE(InfoExtractor):
1501 """Information Extractor for Google Video search queries."""
# Accepts gvsearch:, gvsearchN:, gvsearchall: style pseudo-URLs.
1502 _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
1503 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1504 _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
# Presence of the "next page" button marker in the HTML.
1505 _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
1506 _max_google_results = 1000
1507 IE_NAME = u'video.google:search'
1509 def report_download_page(self, query, pagenum):
1510 """Report attempt to download playlist page with given number."""
1511 query = query.decode(preferredencoding())
1512 self.to_screen(u'query "%s": Downloading page %s' % (query, pagenum))
# Parse the prefix (none / 'all' / integer N) and download that many results.
1514 def _real_extract(self, query):
1515 mobj = re.match(self._VALID_URL, query)
1517 self._downloader.report_error(u'invalid search query "%s"' % query)
1520 prefix, query = query.split(':')
1522 query = query.encode('utf-8')
1524 self._download_n_results(query, 1)
1526 elif prefix == 'all':
1527 self._download_n_results(query, self._max_google_results)
1533 self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
1535 elif n > self._max_google_results:
1536 self._downloader.report_warning(u'gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1537 n = self._max_google_results
1538 self._download_n_results(query, n)
1540 except ValueError: # parsing prefix as integer fails
1541 self._download_n_results(query, 1)
# Scrape HTML result pages (10 per page), collecting docids until `n` are
# found or no next-page marker remains, then enqueue each via the downloader.
1544 def _download_n_results(self, query, n):
1545 """Downloads a specified number of results for a query"""
1551 self.report_download_page(query, pagenum)
1552 result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
1553 request = compat_urllib_request.Request(result_url)
1555 page = compat_urllib_request.urlopen(request).read()
1556 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1557 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
1560 # Extract video identifiers
1561 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1562 video_id = mobj.group(1)
1563 if video_id not in video_ids:
1564 video_ids.append(video_id)
1565 if len(video_ids) == n:
1566 # Specified n videos reached
1567 for id in video_ids:
1568 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
# No next page: flush whatever was collected and stop.
1571 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1572 for id in video_ids:
1573 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1576 pagenum = pagenum + 1
# NOTE(review): embedded ordinals have gaps — try/guard/return lines are
# elided from this excerpt; verify against the full file before editing.
1579 class YahooSearchIE(InfoExtractor):
1580 """Information Extractor for Yahoo! Video search queries."""
# Accepts yvsearch:, yvsearchN:, yvsearchall: style pseudo-URLs.
1583 _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
1584 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1585 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1586 _MORE_PAGES_INDICATOR = r'\s*Next'
1587 _max_yahoo_results = 1000
1588 IE_NAME = u'video.yahoo:search'
1590 def report_download_page(self, query, pagenum):
1591 """Report attempt to download playlist page with given number."""
1592 query = query.decode(preferredencoding())
1593 self.to_screen(u'query "%s": Downloading page %s' % (query, pagenum))
# Parse the prefix (none / 'all' / integer N) and download that many results.
1595 def _real_extract(self, query):
1596 mobj = re.match(self._VALID_URL, query)
1598 self._downloader.report_error(u'invalid search query "%s"' % query)
1601 prefix, query = query.split(':')
1603 query = query.encode('utf-8')
1605 self._download_n_results(query, 1)
1607 elif prefix == 'all':
1608 self._download_n_results(query, self._max_yahoo_results)
1614 self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
1616 elif n > self._max_yahoo_results:
1617 self._downloader.report_warning(u'yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1618 n = self._max_yahoo_results
1619 self._download_n_results(query, n)
1621 except ValueError: # parsing prefix as integer fails
1622 self._download_n_results(query, 1)
# Scrape result pages, de-duplicating ids via `already_seen`, until `n`
# results are found or no Next link remains; enqueue each watch URL.
1625 def _download_n_results(self, query, n):
1626 """Downloads a specified number of results for a query"""
1629 already_seen = set()
1633 self.report_download_page(query, pagenum)
1634 result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
1635 request = compat_urllib_request.Request(result_url)
1637 page = compat_urllib_request.urlopen(request).read()
1638 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1639 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
1642 # Extract video identifiers
1643 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1644 video_id = mobj.group(1)
1645 if video_id not in already_seen:
1646 video_ids.append(video_id)
1647 already_seen.add(video_id)
1648 if len(video_ids) == n:
1649 # Specified n videos reached
1650 for id in video_ids:
1651 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
# No next page: flush whatever was collected and stop.
1654 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1655 for id in video_ids:
1656 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1659 pagenum = pagenum + 1
# NOTE(review): embedded ordinals have gaps — try/guard/return lines are
# elided from this excerpt; verify against the full file before editing.
1662 class YoutubePlaylistIE(InfoExtractor):
1663 """Information Extractor for YouTube playlists."""
# Verbose regex: matches playlist/course/artist/watch URLs carrying a
# p=/a=/list= parameter, or a bare PL/EC/UU playlist id.
1665 _VALID_URL = r"""(?:
1670 (?:course|view_play_list|my_playlists|artist|playlist|watch)
1671 \? (?:.*?&)*? (?:p|a|list)=
1674 ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
1677 ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
1679 _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
1681 IE_NAME = u'youtube:playlist'
1684 def suitable(cls, url):
1685 """Receives a URL and returns True if suitable for this IE."""
# _VALID_URL is written in verbose (x) mode, so the flag must be passed here.
1686 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
1688 def report_download_page(self, playlist_id, pagenum):
1689 """Report attempt to download playlist page with given number."""
1690 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
# Page through the GData playlist feed, collect (position, watch-url) pairs,
# sort by playlist position, and return a single playlist result.
1692 def _real_extract(self, url):
1693 # Extract playlist id
1694 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
1696 self._downloader.report_error(u'invalid url: %s' % url)
1699 # Download playlist videos from API
# Either capture group may have matched depending on URL form.
1700 playlist_id = mobj.group(1) or mobj.group(2)
1705 self.report_download_page(playlist_id, page_num)
1707 url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
1709 page = compat_urllib_request.urlopen(url).read().decode('utf8')
1710 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1711 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
1715 response = json.loads(page)
1716 except ValueError as err:
1717 self._downloader.report_error(u'Invalid JSON in API response: ' + compat_str(err))
1720 if 'feed' not in response:
1721 self._downloader.report_error(u'Got a malformed response from YouTube API')
1723 if 'entry' not in response['feed']:
1724 # Number of videos is a multiple of self._MAX_RESULTS
1727 playlist_title = response['feed']['title']['$t']
# Entries without 'content' (e.g. deleted/private videos) are skipped.
1729 videos += [ (entry['yt$position']['$t'], entry['content']['src'])
1730 for entry in response['feed']['entry']
1731 if 'content' in entry ]
# A short page means we've reached the end of the playlist.
1733 if len(response['feed']['entry']) < self._MAX_RESULTS:
# Sort by playlist position, then keep only the URLs.
1737 videos = [v[1] for v in sorted(videos)]
1739 url_results = [self.url_result(url, 'Youtube') for url in videos]
1740 return [self.playlist_result(url_results, playlist_id, playlist_title)]
# NOTE(review): embedded ordinals have gaps — try/guard/return lines are
# elided from this excerpt; verify against the full file before editing.
1743 class YoutubeChannelIE(InfoExtractor):
1744 """Information Extractor for YouTube channels."""
1746 _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
1747 _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
# CSS class marking the "load more" widget; its presence means more pages exist.
1748 _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
1749 _MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
1750 IE_NAME = u'youtube:channel'
1752 def report_download_page(self, channel_id, pagenum):
1753 """Report attempt to download channel page with given number."""
1754 self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))
# Scrape all /watch?v= ids from an HTML fragment, preserving order and
# de-duplicating within the page.
1756 def extract_videos_from_page(self, page):
1758 for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
1759 if mobj.group(1) not in ids_in_page:
1760 ids_in_page.append(mobj.group(1))
# Fetch the channel's first HTML page, then follow the JSON channel_ajax
# endpoint for subsequent pages; return one playlist of watch URLs.
1763 def _real_extract(self, url):
1764 # Extract channel id
1765 mobj = re.match(self._VALID_URL, url)
1767 self._downloader.report_error(u'invalid url: %s' % url)
1770 # Download channel page
1771 channel_id = mobj.group(1)
1775 self.report_download_page(channel_id, pagenum)
1776 url = self._TEMPLATE_URL % (channel_id, pagenum)
1777 request = compat_urllib_request.Request(url)
1779 page = compat_urllib_request.urlopen(request).read().decode('utf8')
1780 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1781 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
1784 # Extract video identifiers
1785 ids_in_page = self.extract_videos_from_page(page)
1786 video_ids.extend(ids_in_page)
1788 # Download any subsequent channel pages using the json-based channel_ajax query
1789 if self._MORE_PAGES_INDICATOR in page:
1791 pagenum = pagenum + 1
1793 self.report_download_page(channel_id, pagenum)
1794 url = self._MORE_PAGES_URL % (pagenum, channel_id)
1795 request = compat_urllib_request.Request(url)
1797 page = compat_urllib_request.urlopen(request).read().decode('utf8')
1798 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1799 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
# The ajax endpoint returns JSON with the rendered HTML under content_html.
1802 page = json.loads(page)
1804 ids_in_page = self.extract_videos_from_page(page['content_html'])
1805 video_ids.extend(ids_in_page)
# Stop when the widget HTML no longer advertises another page.
1807 if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
1810 self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))
1812 urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids]
1813 url_entries = [self.url_result(url, 'Youtube') for url in urls]
1814 return [self.playlist_result(url_entries, channel_id)]
# NOTE(review): embedded ordinals have gaps — try/guard/return lines are
# elided from this excerpt; verify against the full file before editing.
1817 class YoutubeUserIE(InfoExtractor):
1818 """Information Extractor for YouTube users."""
# Accepts youtube.com/user/<name> URLs or the ytuser:<name> shorthand.
1820 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1821 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
# GData caps uploads queries at 50 results per request.
1822 _GDATA_PAGE_SIZE = 50
1823 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
1824 _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
1825 IE_NAME = u'youtube:user'
1827 def report_download_page(self, username, start_index):
1828 """Report attempt to download user page."""
1829 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
1830 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
# Page through the user's uploads feed collecting watch ids; stop early when
# a page comes back short (no more uploads). Returns one playlist result.
1832 def _real_extract(self, url):
1834 mobj = re.match(self._VALID_URL, url)
1836 self._downloader.report_error(u'invalid url: %s' % url)
1839 username = mobj.group(1)
1841 # Download video ids using YouTube Data API. Result size per
1842 # query is limited (currently to 50 videos) so we need to query
1843 # page by page until there are no video ids - it means we got
# GData start-index is 1-based.
1850 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1851 self.report_download_page(username, start_index)
1853 request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
1856 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1857 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1858 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
1861 # Extract video identifiers
1864 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1865 if mobj.group(1) not in ids_in_page:
1866 ids_in_page.append(mobj.group(1))
1868 video_ids.extend(ids_in_page)
1870 # A little optimization - if current page is not
1871 # "full", ie. does not contain PAGE_SIZE video ids then
1872 # we can assume that this page is the last one - there
1873 # are no more ids on further pages - no need to query
1876 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
1881 urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
1882 url_results = [self.url_result(url, 'Youtube') for url in urls]
1883 return [self.playlist_result(url_results, playlist_title = username)]
# NOTE(review): embedded ordinals have gaps — try/guard/return lines are
# elided from this excerpt; verify against the full file before editing.
1886 class BlipTVUserIE(InfoExtractor):
1887 """Information Extractor for blip.tv users."""
# Accepts blip.tv/<user> URLs (no trailing path) or bliptvuser:<user>.
1889 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
1891 IE_NAME = u'blip.tv:user'
1893 def report_download_page(self, username, pagenum):
1894 """Report attempt to download user page."""
1895 self.to_screen(u'user %s: Downloading video ids from page %d' %
1896 (username, pagenum))
# Resolve the user's numeric id from their page, then page through blip.tv's
# mobile episode-list AJAX endpoint collecting video paths; returns one
# playlist of blip.tv URLs.
1898 def _real_extract(self, url):
1900 mobj = re.match(self._VALID_URL, url)
1902 self._downloader.report_error(u'invalid url: %s' % url)
1905 username = mobj.group(1)
# %s placeholder is filled with the numeric users_id scraped below.
1907 page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
1909 request = compat_urllib_request.Request(url)
1912 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1913 mobj = re.search(r'data-users-id="([^"]+)"', page)
1914 page_base = page_base % mobj.group(1)
1915 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1916 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
1920 # Download video ids using BlipTV Ajax calls. Result size per
1921 # query is limited (currently to 12 videos) so we need to query
1922 # page by page until there are no video ids - it means we got
1929 self.report_download_page(username, pagenum)
1930 url = page_base + "&page=" + str(pagenum)
1931 request = compat_urllib_request.Request( url )
1933 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1934 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1935 self._downloader.report_error(u'unable to download webpage: %s' % str(err))
1938 # Extract video identifiers
1941 for mobj in re.finditer(r'href="/([^"]+)"', page):
1942 if mobj.group(1) not in ids_in_page:
1943 ids_in_page.append(unescapeHTML(mobj.group(1)))
1945 video_ids.extend(ids_in_page)
1947 # A little optimization - if current page is not
1948 # "full", ie. does not contain PAGE_SIZE video ids then
1949 # we can assume that this page is the last one - there
1950 # are no more ids on further pages - no need to query
1953 if len(ids_in_page) < self._PAGE_SIZE:
1958 urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
1959 url_entries = [self.url_result(url, 'BlipTV') for url in urls]
1960 return [self.playlist_result(url_entries, playlist_title = username)]
# NOTE(review): embedded ordinals have gaps — try/guard/return lines are
# elided from this excerpt; verify against the full file before editing.
1963 class DepositFilesIE(InfoExtractor):
1964 """Information extractor for depositfiles.com"""
1966 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
# Simulate pressing the "Free download" button via a POST, then scrape the
# real file-share URL and title from the resulting page.
1968 def _real_extract(self, url):
1969 file_id = url.split('/')[-1]
1970 # Rebuild url in english locale
1971 url = 'http://depositfiles.com/en/files/' + file_id
1973 # Retrieve file webpage with 'Free download' button pressed
# Passing data makes this a POST request.
1974 free_download_indication = { 'gateway_result' : '1' }
1975 request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
1977 self.report_download_webpage(file_id)
1978 webpage = compat_urllib_request.urlopen(request).read()
1979 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1980 self._downloader.report_error(u'Unable to retrieve file webpage: %s' % compat_str(err))
1983 # Search for the real file URL
1984 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
1985 if (mobj is None) or (mobj.group(1) is None):
1986 # Try to figure out reason of the error.
# Surface the site's own restriction message (e.g. download limit) if present.
1987 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
1988 if (mobj is not None) and (mobj.group(1) is not None):
1989 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
1990 self._downloader.report_error(u'%s' % restriction_message)
1992 self._downloader.report_error(u'unable to extract download URL from: %s' % url)
1995 file_url = mobj.group(1)
1996 file_extension = os.path.splitext(file_url)[1][1:]
1998 # Search for file title
1999 mobj = re.search(r'<b title="(.*?)">', webpage)
2001 self._downloader.report_error(u'unable to extract title')
2003 file_title = mobj.group(1).decode('utf-8')
# Result dictionary (surrounding return/list literal elided in this excerpt).
2006 'id': file_id.decode('utf-8'),
2007 'url': file_url.decode('utf-8'),
2009 'upload_date': None,
2010 'title': file_title,
2011 'ext': file_extension.decode('utf-8'),
# FacebookIE: optionally logs in (via --username/--password or .netrc) in
# _real_initialize, then extracts video metadata from JSON embedded in the
# page's SWF setup script in _real_extract.
# NOTE(review): sparse listing — try:/else/return lines are partly missing.
2015 class FacebookIE(InfoExtractor):
2016 """Information Extractor for Facebook"""
2018 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2019 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2020 _NETRC_MACHINE = 'facebook'
2021 IE_NAME = u'facebook'
2023 def report_login(self):
2024 """Report attempt to log in."""
2025 self.to_screen(u'Logging in')
# Log in before extraction; credentials come from downloader params or,
# failing that, from the user's .netrc ('facebook' machine entry).
2027 def _real_initialize(self):
2028 if self._downloader is None:
2033 downloader_params = self._downloader.params
2035 # Attempt to use provided username and password or .netrc data
2036 if downloader_params.get('username', None) is not None:
2037 useremail = downloader_params['username']
2038 password = downloader_params['password']
2039 elif downloader_params.get('usenetrc', False):
2041 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2042 if info is not None:
2046 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
# .netrc problems only warn — anonymous extraction is still attempted.
2047 except (IOError, netrc.NetrcParseError) as err:
2048 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
2051 if useremail is None:
2060 request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
2063 login_results = compat_urllib_request.urlopen(request).read()
# If the login form is still present in the response, auth failed.
# NOTE(review): 'exceded' is a typo in a user-facing string; left as-is
# in this documentation-only pass.
2064 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2065 self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2067 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2068 self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
2071 def _real_extract(self, url):
2072 mobj = re.match(self._VALID_URL, url)
2074 self._downloader.report_error(u'invalid URL: %s' % url)
2076 video_id = mobj.group('ID')
# Normalize to the canonical video page before downloading.
2078 url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
2079 webpage = self._download_webpage(url, video_id)
# BEFORE/AFTER delimit the JSON blob of [name, value] pairs that the page
# feeds to its SWF player; dict() turns those pairs into a mapping.
2081 BEFORE = '{swf.addParam(param[0], param[1]);});\n'
2082 AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
2083 m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
2085 raise ExtractorError(u'Cannot parse data')
2086 data = dict(json.loads(m.group(1)))
2087 params_raw = compat_urllib_parse.unquote(data['params'])
2088 params = json.loads(params_raw)
2089 video_data = params['video_data'][0]
# Prefer the HD stream, fall back to SD.
2090 video_url = video_data.get('hd_src')
2092 video_url = video_data['sd_src']
2094 raise ExtractorError(u'Cannot find video URL')
2095 video_duration = int(video_data['video_duration'])
2096 thumbnail = video_data['thumbnail_src']
2098 m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
2100 raise ExtractorError(u'Cannot find title in webpage')
2101 video_title = unescapeHTML(m.group(1))
2105 'title': video_title,
2108 'duration': video_duration,
2109 'thumbnail': thumbnail,
# BlipTVIE: extracts blip.tv videos either as a direct media download (when
# the server answers with a video/* Content-Type) or via the site's JSON API
# (requested by spoofing an iTunes User-Agent).
# NOTE(review): sparse listing — try:/else/return lines are partly missing.
2114 class BlipTVIE(InfoExtractor):
2115 """Information extractor for blip.tv"""
2117 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
# Used to pull the filename extension off the media URL.
2118 _URL_EXT = r'^.*\.([a-z0-9]+)$'
2119 IE_NAME = u'blip.tv'
2121 def report_direct_download(self, title):
2122 """Report information extraction."""
2123 self.to_screen(u'%s: Direct download detected' % title)
2125 def _real_extract(self, url):
2126 mobj = re.match(self._VALID_URL, url)
2128 self._downloader.report_error(u'invalid URL: %s' % url)
# /play/ URLs redirect to a player whose fragment carries the real file
# reference; resolve it and recurse on the canonical /a/a-<id> URL.
2131 urlp = compat_urllib_parse_urlparse(url)
2132 if urlp.path.startswith('/play/'):
2133 request = compat_urllib_request.Request(url)
2134 response = compat_urllib_request.urlopen(request)
2135 redirecturl = response.geturl()
2136 rurlp = compat_urllib_parse_urlparse(redirecturl)
2137 file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
2138 url = 'http://blip.tv/a/a-' + file_id
2139 return self._real_extract(url)
# Ask the site for its JSON representation of the page.
2146 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2147 request = compat_urllib_request.Request(json_url)
# The iTunes User-Agent makes blip.tv serve the feed this code expects.
2148 request.add_header('User-Agent', 'iTunes/10.6.1')
2149 self.report_extraction(mobj.group(1))
2152 urlh = compat_urllib_request.urlopen(request)
2153 if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
2154 basename = url.split('/')[-1]
2155 title,ext = os.path.splitext(basename)
# NOTE(review): .decode on a str is Python-2 only — TODO confirm target.
2156 title = title.decode('UTF-8')
2157 ext = ext.replace('.', '')
2158 self.report_direct_download(title)
2163 'upload_date': None,
2168 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2169 raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
2170 if info is None: # Regular URL
2172 json_code_bytes = urlh.read()
2173 json_code = json_code_bytes.decode('utf-8')
2174 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2175 self._downloader.report_error(u'unable to read video info webpage: %s' % compat_str(err))
2179 json_data = json.loads(json_code)
2180 if 'Post' in json_data:
2181 data = json_data['Post']
# Convert the site's datestamp to YYYYMMDD. NOTE(review): the format
# mixes 24-hour %H with %p (AM/PM) — looks odd; verify against real feed
# values before touching it.
2185 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2186 video_url = data['media']['url']
2187 umobj = re.match(self._URL_EXT, video_url)
2189 raise ValueError('Can not determine filename extension')
2190 ext = umobj.group(1)
2193 'id': data['item_id'],
2195 'uploader': data['display_name'],
2196 'upload_date': upload_date,
2197 'title': data['title'],
2199 'format': data['media']['mimeType'],
2200 'thumbnail': data['thumbnailUrl'],
2201 'description': data['description'],
2202 'player_url': data['embedUrl'],
# Propagate the spoofed UA so the downloader fetches what we probed.
2203 'user_agent': 'iTunes/10.6.1',
2205 except (ValueError,KeyError) as err:
2206 self._downloader.report_error(u'unable to parse video information: %s' % repr(err))
# MyVideoIE: extracts myvideo.de videos by scraping the media host out of
# the page's image_src link and appending '/<id>.flv'.
# NOTE(review): sparse listing — if-None guards and the return dict are
# partly missing from view.
2212 class MyVideoIE(InfoExtractor):
2213 """Information Extractor for myvideo.de."""
2215 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2216 IE_NAME = u'myvideo'
2218 def _real_extract(self,url):
2219 mobj = re.match(self._VALID_URL, url)
# NOTE(review): BUG — attribute is `self._downloader` everywhere else in
# this file; `self._download` would raise AttributeError on invalid URLs.
# Left unchanged in this documentation-only pass; TODO fix.
2221 self._download.report_error(u'invalid URL: %s' % url)
2224 video_id = mobj.group(1)
2227 webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
2228 webpage = self._download_webpage(webpage_url, video_id)
2230 self.report_extraction(video_id)
# The thumbnail link exposes the media host/path; the video itself lives
# next to it as <id>.flv.
2231 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\'',
2234 self._downloader.report_error(u'unable to extract media URL')
2236 video_url = mobj.group(1) + ('/%s.flv' % video_id)
2238 mobj = re.search('<title>([^<]+)</title>', webpage)
2240 self._downloader.report_error(u'unable to extract title')
2243 video_title = mobj.group(1)
2249 'upload_date': None,
2250 'title': video_title,
# ComedyCentralIE: resolves Daily Show / Colbert Report episode and clip
# URLs (including ':tds'/':colbert' abbreviations) to MTV Networks media,
# downloads the MRSS show index, then each part's mediagen config, and maps
# the RTMP rendition URL onto an equivalent HTTP mirror.
# NOTE(review): sparse listing — try:/else/return lines are partly missing.
2254 class ComedyCentralIE(InfoExtractor):
2255 """Information extractor for The Daily Show and Colbert Report """
2257 # urls can be abbreviations like :thedailyshow or :colbert
2258 # urls for episodes like:
2259 # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
2260 # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
2261 # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
2262 _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
2263 |(https?://)?(www\.)?
2264 (?P<showname>thedailyshow|colbertnation)\.com/
2265 (full-episodes/(?P<episode>.*)|
2267 (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
2268 |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
# Bitrates the site is known to offer, lowest quality last-resort first in
# _print_formats terms; turls below is ordered so [-1] is the highest.
2271 _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
2273 _video_extensions = {
2281 _video_dimensions = {
# Overrides the base class because _VALID_URL is a verbose-mode pattern.
2291 def suitable(cls, url):
2292 """Receives a URL and returns True if suitable for this IE."""
2293 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
2295 def report_config_download(self, episode_id, media_id):
2296 self.to_screen(u'%s: Downloading configuration for %s' % (episode_id, media_id))
2298 def report_index_download(self, episode_id):
2299 self.to_screen(u'%s: Downloading show index' % episode_id)
2301 def _print_formats(self, formats):
2302 print('Available formats:')
2304 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))
2307 def _real_extract(self, url):
2308 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2310 self._downloader.report_error(u'invalid URL: %s' % url)
# Expand ':tds'-style abbreviations to the full-episodes listing URL and
# re-match so the named groups are populated.
2313 if mobj.group('shortname'):
2314 if mobj.group('shortname') in ('tds', 'thedailyshow'):
2315 url = u'http://www.thedailyshow.com/full-episodes/'
2317 url = u'http://www.colbertnation.com/full-episodes/'
2318 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2319 assert mobj is not None
2321 if mobj.group('clip'):
2322 if mobj.group('showname') == 'thedailyshow':
2323 epTitle = mobj.group('tdstitle')
2325 epTitle = mobj.group('cntitle')
# No explicit episode means "download the newest one".
2328 dlNewest = not mobj.group('episode')
2330 epTitle = mobj.group('showname')
2332 epTitle = mobj.group('episode')
2334 req = compat_urllib_request.Request(url)
2335 self.report_extraction(epTitle)
2337 htmlHandle = compat_urllib_request.urlopen(req)
2338 html = htmlHandle.read()
2339 webpage = html.decode('utf-8')
2340 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2341 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
# The listing page redirects to the newest episode; re-parse the final
# URL to recover a concrete episode title.
2344 url = htmlHandle.geturl()
2345 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2347 self._downloader.report_error(u'Invalid redirected URL: ' + url)
2349 if mobj.group('episode') == '':
2350 self._downloader.report_error(u'Redirected URL is still not specific: ' + url)
2352 epTitle = mobj.group('episode')
# Find the mtvnservices media URI embedded in the player markup.
2354 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)
2356 if len(mMovieParams) == 0:
2357 # The Colbert Report embeds the information in a without
2358 # a URL prefix; so extract the alternate reference
2359 # and then add the URL prefix manually.
2361 altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
2362 if len(altMovieParams) == 0:
2363 self._downloader.report_error(u'unable to find Flash URL in webpage ' + url)
2366 mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
2368 uri = mMovieParams[0][1]
# The MRSS index lists each part of the episode as an <item>.
2369 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
2370 self.report_index_download(epTitle)
2372 indexXml = compat_urllib_request.urlopen(indexUrl).read()
2373 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2374 self._downloader.report_error(u'unable to download episode index: ' + compat_str(err))
2379 idoc = xml.etree.ElementTree.fromstring(indexXml)
2380 itemEls = idoc.findall('.//item')
2381 for partNum,itemEl in enumerate(itemEls):
2382 mediaId = itemEl.findall('./guid')[0].text
2383 shortMediaId = mediaId.split(':')[-1]
2384 showId = mediaId.split(':')[-2].replace('.com', '')
2385 officialTitle = itemEl.findall('./title')[0].text
2386 officialDate = itemEl.findall('./pubDate')[0].text
# Per-part mediagen config enumerates the available renditions.
2388 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2389 compat_urllib_parse.urlencode({'uri': mediaId}))
2390 configReq = compat_urllib_request.Request(configUrl)
2391 self.report_config_download(epTitle, shortMediaId)
2393 configXml = compat_urllib_request.urlopen(configReq).read()
2394 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2395 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
2398 cdoc = xml.etree.ElementTree.fromstring(configXml)
2400 for rendition in cdoc.findall('.//rendition'):
2401 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2405 self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
2408 if self._downloader.params.get('listformats', None):
2409 self._print_formats([i[0] for i in turls])
2412 # For now, just pick the highest bitrate
# NOTE(review): assumes the feed lists renditions in ascending bitrate
# order — TODO confirm against a live mediagen response.
2413 format,rtmp_video_url = turls[-1]
2415 # Get the format arg from the arg stream
2416 req_format = self._downloader.params.get('format', None)
2418 # Select format if we can find one
2421 format, rtmp_video_url = f, v
# Map the rtmp(e) path onto the known HTTP mirror (rtmpdump-free path).
2424 m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
2426 raise ExtractorError(u'Cannot transform RTMP url')
2427 base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
2428 video_url = base + m.group('finalid')
# Title each multi-part episode as '<show>-<episode> part N'.
2430 effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
2435 'upload_date': officialDate,
2440 'description': officialTitle,
2442 results.append(info)
# EscapistIE: scrapes escapistmagazine.com video pages; reads the player's
# config URL out of the og:video meta tag and takes the media URL from the
# player's (JavaScript-flavoured) JSON playlist.
# NOTE(review): sparse listing — try:/return lines are partly missing.
2447 class EscapistIE(InfoExtractor):
2448 """Information extractor for The Escapist """
2450 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2451 IE_NAME = u'escapist'
2453 def report_config_download(self, showName):
2454 self.to_screen(u'%s: Downloading configuration' % showName)
2456 def _real_extract(self, url):
2457 mobj = re.match(self._VALID_URL, url)
2459 self._downloader.report_error(u'invalid URL: %s' % url)
2461 showName = mobj.group('showname')
2462 videoId = mobj.group('episode')
2464 self.report_extraction(showName)
2466 webPage = compat_urllib_request.urlopen(url)
2467 webPageBytes = webPage.read()
# Decode with the charset the server declared, defaulting to UTF-8.
2468 m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
2469 webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
2470 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2471 self._downloader.report_error(u'unable to download webpage: ' + compat_str(err))
# Pull metadata out of the page's <meta> tags.
2474 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2475 description = unescapeHTML(descMatch.group(1))
2476 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2477 imgUrl = unescapeHTML(imgMatch.group(1))
2478 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2479 playerUrl = unescapeHTML(playerUrlMatch.group(1))
# The player URL carries its config location as a 'config=' query value.
2480 configUrlMatch = re.search('config=(.*)$', playerUrl)
2481 configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))
2483 self.report_config_download(showName)
2485 configJSON = compat_urllib_request.urlopen(configUrl)
2486 m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
2487 configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
2488 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2489 self._downloader.report_error(u'unable to download configuration: ' + compat_str(err))
2492 # Technically, it's JavaScript, not JSON
# NOTE(review): blanket quote replacement is fragile — it would corrupt
# any apostrophe inside a string value; acknowledged hack.
2493 configJSON = configJSON.replace("'", '"')
2496 config = json.loads(configJSON)
2497 except (ValueError,) as err:
2498 self._downloader.report_error(u'Invalid JSON in configuration file: ' + compat_str(err))
2501 playlist = config['playlist']
# playlist[1] holds the actual video entry — presumably [0] is an intro
# or poster item; TODO confirm against a live config.
2502 videoUrl = playlist[1]['url']
2507 'uploader': showName,
2508 'upload_date': None,
2511 'thumbnail': imgUrl,
2512 'description': description,
2513 'player_url': playerUrl,
# CollegeHumorIE: reads the moogaloop metadata XML for a video, then the
# Adobe HDS (f4m) manifest it points at, and assembles the segment URL from
# the manifest's media/id nodes.
# NOTE(review): sparse listing — try:/return lines are partly missing.
2518 class CollegeHumorIE(InfoExtractor):
2519 """Information extractor for collegehumor.com"""
2522 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2523 IE_NAME = u'collegehumor'
2525 def report_manifest(self, video_id):
2526 """Report information extraction."""
2527 self.to_screen(u'%s: Downloading XML manifest' % video_id)
2529 def _real_extract(self, url):
2530 mobj = re.match(self._VALID_URL, url)
2532 self._downloader.report_error(u'invalid URL: %s' % url)
2534 video_id = mobj.group('videoid')
2539 'upload_date': None,
2542 self.report_extraction(video_id)
# Step 1: metadata XML (title/description/thumbnail/manifest location).
2543 xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
2545 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2546 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2547 self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
2550 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2552 videoNode = mdoc.findall('./video')[0]
2553 info['description'] = videoNode.findall('./description')[0].text
2554 info['title'] = videoNode.findall('./caption')[0].text
2555 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
2556 manifest_url = videoNode.findall('./file')[0].text
2558 self._downloader.report_error(u'Invalid metadata XML file')
# hdcore marks the request as coming from an HDS-capable client.
2561 manifest_url += '?hdcore=2.10.3'
2562 self.report_manifest(video_id)
2564 manifestXml = compat_urllib_request.urlopen(manifest_url).read()
2565 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2566 self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
# Step 2: f4m manifest (note the Adobe f4m XML namespace on lookups).
2569 adoc = xml.etree.ElementTree.fromstring(manifestXml)
2571 media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
2572 node_id = media_node.attrib['url']
2573 video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
2574 except IndexError as err:
2575 self._downloader.report_error(u'Invalid manifest file')
# Build the segment URL: /z<id-minus-2-chars>/<media url>Seg1-Frag1.
# NOTE(review): only the first fragment is addressed here — presumably
# the downloader handles the rest; confirm before changing.
2578 url_pr = compat_urllib_parse_urlparse(manifest_url)
2579 url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
# XVideosIE: scrapes the flv URL, title and thumbnail straight out of the
# xvideos.com watch page.
# NOTE(review): sparse listing — if-None guards and the return dict are
# partly missing from view.
2586 class XVideosIE(InfoExtractor):
2587 """Information extractor for xvideos.com"""
2589 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2590 IE_NAME = u'xvideos'
2592 def _real_extract(self, url):
2593 mobj = re.match(self._VALID_URL, url)
2595 self._downloader.report_error(u'invalid URL: %s' % url)
2597 video_id = mobj.group(1)
2599 webpage = self._download_webpage(url, video_id)
2601 self.report_extraction(video_id)
# The media URL is URL-encoded inside the player's flv_url parameter.
2605 mobj = re.search(r'flv_url=(.+?)&', webpage)
2607 self._downloader.report_error(u'unable to extract video url')
2609 video_url = compat_urllib_parse.unquote(mobj.group(1))
# Title is the <title> text with the trailing site suffix stripped.
2613 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2615 self._downloader.report_error(u'unable to extract video title')
2617 video_title = mobj.group(1)
2620 # Extract video thumbnail
2621 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
2623 self._downloader.report_error(u'unable to extract video thumbnail')
# group(0): the whole matched URL is the thumbnail address.
2625 video_thumbnail = mobj.group(0)
2631 'upload_date': None,
2632 'title': video_title,
2634 'thumbnail': video_thumbnail,
2635 'description': None,
# SoundcloudIE: resolves a soundcloud.com/<uploader>/<slug> URL through the
# public resolve API, then fetches the track's stream catalogue and picks
# the 128kbps MP3 HTTP stream.
# NOTE(review): sparse listing — try:/return lines are partly missing. The
# client_id below is hard-coded and may be revoked server-side.
2641 class SoundcloudIE(InfoExtractor):
2642 """Information extractor for soundcloud.com
2643 To access the media, the uid of the song and a stream token
2644 must be extracted from the page source and the script must make
2645 a request to media.soundcloud.com/crossdomain.xml. Then
2646 the media can be grabbed by requesting from an url composed
2647 of the stream token and uid
2650 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2651 IE_NAME = u'soundcloud'
2653 def report_resolve(self, video_id):
2654 """Report information extraction."""
2655 self.to_screen(u'%s: Resolving id' % video_id)
2657 def _real_extract(self, url):
2658 mobj = re.match(self._VALID_URL, url)
2660 self._downloader.report_error(u'invalid URL: %s' % url)
2663 # extract uploader (which is in the url)
2664 uploader = mobj.group(1)
2665 # extract simple title (uploader + slug of song title)
2666 slug_title = mobj.group(2)
2667 simple_title = uploader + u'-' + slug_title
2669 self.report_resolve('%s/%s' % (uploader, slug_title))
# resolve.json maps the human URL to the track's numeric API id.
2671 url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
2672 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2673 request = compat_urllib_request.Request(resolv_url)
2675 info_json_bytes = compat_urllib_request.urlopen(request).read()
2676 info_json = info_json_bytes.decode('utf-8')
2677 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2678 self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
2681 info = json.loads(info_json)
2682 video_id = info['id']
2683 self.report_extraction('%s/%s' % (uploader, slug_title))
# The streams endpoint lists per-format URLs for this track id.
2685 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2686 request = compat_urllib_request.Request(streams_url)
2688 stream_json_bytes = compat_urllib_request.urlopen(request).read()
2689 stream_json = stream_json_bytes.decode('utf-8')
2690 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2691 self._downloader.report_error(u'unable to download stream definitions: %s' % compat_str(err))
2694 streams = json.loads(stream_json)
2695 mediaURL = streams['http_mp3_128_url']
2700 'uploader': info['user']['username'],
# NOTE(review): 'created_at' is passed through verbatim, not normalized
# to the YYYYMMDD upload_date convention documented on InfoExtractor.
2701 'upload_date': info['created_at'],
2702 'title': info['title'],
2704 'description': info['description'],
# SoundcloudSetIE: like SoundcloudIE but for /sets/ (playlist) URLs —
# resolves the set, then fetches the stream catalogue for every track in it.
# NOTE(review): IE_NAME duplicates SoundcloudIE's u'soundcloud'; a distinct
# name (e.g. 'soundcloud:set') would disambiguate --list-extractors output.
# Sparse listing — try:/return lines are partly missing.
2707 class SoundcloudSetIE(InfoExtractor):
2708 """Information extractor for soundcloud.com sets
2709 To access the media, the uid of the song and a stream token
2710 must be extracted from the page source and the script must make
2711 a request to media.soundcloud.com/crossdomain.xml. Then
2712 the media can be grabbed by requesting from an url composed
2713 of the stream token and uid
2716 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
2717 IE_NAME = u'soundcloud'
2719 def report_resolve(self, video_id):
2720 """Report information extraction."""
2721 self.to_screen(u'%s: Resolving id' % video_id)
2723 def _real_extract(self, url):
2724 mobj = re.match(self._VALID_URL, url)
2726 self._downloader.report_error(u'invalid URL: %s' % url)
2729 # extract uploader (which is in the url)
2730 uploader = mobj.group(1)
2731 # extract simple title (uploader + slug of song title)
2732 slug_title = mobj.group(2)
2733 simple_title = uploader + u'-' + slug_title
2735 self.report_resolve('%s/sets/%s' % (uploader, slug_title))
# resolve.json maps the human set URL to the set's track list.
2737 url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
2738 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2739 request = compat_urllib_request.Request(resolv_url)
2741 info_json_bytes = compat_urllib_request.urlopen(request).read()
2742 info_json = info_json_bytes.decode('utf-8')
2743 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2744 self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
# The resolver reports per-item failures in an 'errors' list.
2748 info = json.loads(info_json)
2749 if 'errors' in info:
2750 for err in info['errors']:
2751 self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
# One streams request per track in the set.
2754 for track in info['tracks']:
2755 video_id = track['id']
2756 self.report_extraction('%s/sets/%s' % (uploader, slug_title))
2758 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2759 request = compat_urllib_request.Request(streams_url)
2761 stream_json_bytes = compat_urllib_request.urlopen(request).read()
2762 stream_json = stream_json_bytes.decode('utf-8')
2763 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2764 self._downloader.report_error(u'unable to download stream definitions: %s' % compat_str(err))
2767 streams = json.loads(stream_json)
2768 mediaURL = streams['http_mp3_128_url']
2773 'uploader': track['user']['username'],
# NOTE(review): raw 'created_at', not YYYYMMDD — same as SoundcloudIE.
2774 'upload_date': track['created_at'],
2775 'title': track['title'],
2777 'description': track['description'],
# InfoQIE: extracts InfoQ presentation videos; the real media path is
# base64-encoded in the page's jsclassref variable and served over RTMPE.
# NOTE(review): sparse listing — if-None guards and the return dict are
# partly missing from view.
2782 class InfoQIE(InfoExtractor):
2783 """Information extractor for infoq.com"""
2784 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2786 def _real_extract(self, url):
2787 mobj = re.match(self._VALID_URL, url)
2789 self._downloader.report_error(u'invalid URL: %s' % url)
2792 webpage = self._download_webpage(url, video_id=url)
2793 self.report_extraction(url)
# jsclassref holds the base64 of a percent-encoded media path.
2796 mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
2798 self._downloader.report_error(u'unable to extract video url')
2800 real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
2801 video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id
2804 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2806 self._downloader.report_error(u'unable to extract video title')
2808 video_title = mobj.group(1)
2810 # Extract description
2811 video_description = u'No description available.'
2812 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2813 if mobj is not None:
2814 video_description = mobj.group(1)
# Derive id and extension from the decoded media filename.
2816 video_filename = video_url.split('/')[-1]
2817 video_id, extension = video_filename.split('.')
2823 'upload_date': None,
2824 'title': video_title,
2825 'ext': extension, # Extension is always(?) mp4, but seems to be flv
2827 'description': video_description,
# MixcloudIE: extracts mixcloud.com cloudcasts via the legacy /api/1/
# cloudcast JSON, probing candidate stream URLs until one answers.
# Marked _WORKING = False — disabled pending a port to the newer API.
# NOTE(review): sparse listing — try:/else/return lines are partly missing,
# and the .decode() calls on str values are Python-2-only idioms.
2832 class MixcloudIE(InfoExtractor):
2833 """Information extractor for www.mixcloud.com"""
2835 _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
2836 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2837 IE_NAME = u'mixcloud'
2839 def report_download_json(self, file_id):
2840 """Report JSON download."""
2841 self.to_screen(u'Downloading json')
2843 def get_urls(self, jsonData, fmt, bitrate='best'):
2844 """Get urls from 'audio_formats' section in json"""
2847 bitrate_list = jsonData[fmt]
# 'best' (or an unknown bitrate) falls back to the highest available.
2848 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2849 bitrate = max(bitrate_list) # select highest
2851 url_list = jsonData[fmt][bitrate]
2852 except TypeError: # we have no bitrate info.
2853 url_list = jsonData[fmt]
2856 def check_urls(self, url_list):
2857 """Returns 1st active url from list"""
# Probe each candidate with a GET; the first that opens wins.
2858 for url in url_list:
2860 compat_urllib_request.urlopen(url)
2862 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2867 def _print_formats(self, formats):
2868 print('Available formats:')
2869 for fmt in formats.keys():
2870 for b in formats[fmt]:
2872 ext = formats[fmt][b][0]
2873 print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
2874 except TypeError: # we have no bitrate info
2875 ext = formats[fmt][0]
2876 print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
2879 def _real_extract(self, url):
2880 mobj = re.match(self._VALID_URL, url)
2882 self._downloader.report_error(u'invalid URL: %s' % url)
2884 # extract uploader & filename from url
2885 uploader = mobj.group(1).decode('utf-8')
2886 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2888 # construct API request
# /api/1/cloudcast/<uploader>/<slug>.json
2889 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2890 # retrieve .json file with links to files
2891 request = compat_urllib_request.Request(file_url)
2893 self.report_download_json(file_url)
2894 jsonData = compat_urllib_request.urlopen(request).read()
2895 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2896 self._downloader.report_error(u'Unable to retrieve file: %s' % compat_str(err))
2900 json_data = json.loads(jsonData)
2901 player_url = json_data['player_swf_url']
2902 formats = dict(json_data['audio_formats'])
2904 req_format = self._downloader.params.get('format', None)
2907 if self._downloader.params.get('listformats', None):
2908 self._print_formats(formats)
# No explicit format (or 'best'): take the first format whose URL probe
# succeeds; otherwise honour the requested format if present.
2911 if req_format is None or req_format == 'best':
2912 for format_param in formats.keys():
2913 url_list = self.get_urls(formats, format_param)
2915 file_url = self.check_urls(url_list)
2916 if file_url is not None:
2919 if req_format not in formats:
2920 self._downloader.report_error(u'format is not available')
2923 url_list = self.get_urls(formats, req_format)
2924 file_url = self.check_urls(url_list)
2925 format_param = req_format
2928 'id': file_id.decode('utf-8'),
2929 'url': file_url.decode('utf-8'),
2930 'uploader': uploader.decode('utf-8'),
2931 'upload_date': None,
2932 'title': json_data['name'],
2933 'ext': file_url.split('.')[-1].decode('utf-8'),
2934 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2935 'thumbnail': json_data['thumbnail_url'],
2936 'description': json_data['description'],
2937 'player_url': player_url.decode('utf-8'),
# StanfordOpenClassroomIE: handles three URL shapes — a specific video
# (course+video), a course page (course only), and the site root — the
# latter two return 'reference' playlists that are recursively re-extracted.
# NOTE(review): sparse listing — try:/return and some dict lines missing.
2940 class StanfordOpenClassroomIE(InfoExtractor):
2941 """Information extractor for Stanford's Open ClassRoom"""
2943 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
2944 IE_NAME = u'stanfordoc'
2946 def _real_extract(self, url):
2947 mobj = re.match(self._VALID_URL, url)
2949 raise ExtractorError(u'Invalid URL: %s' % url)
# Mode 1: a single lecture video — metadata comes from a per-video XML.
2951 if mobj.group('course') and mobj.group('video'): # A specific video
2952 course = mobj.group('course')
2953 video = mobj.group('video')
2955 'id': course + '_' + video,
2957 'upload_date': None,
2960 self.report_extraction(info['id'])
2961 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
2962 xmlUrl = baseUrl + video + '.xml'
2964 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2965 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2966 self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
2968 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2970 info['title'] = mdoc.findall('./title')[0].text
2971 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
2973 self._downloader.report_error(u'Invalid metadata XML file')
2975 info['ext'] = info['url'].rpartition('.')[2]
# Mode 2: a course page — collect its VideoPage links as references.
2977 elif mobj.group('course'): # A course page
2978 course = mobj.group('course')
2983 'upload_date': None,
2986 coursepage = self._download_webpage(url, info['id'],
2987 note='Downloading course info page',
2988 errnote='Unable to download course info page')
2990 m = re.search('<h1>([^<]+)</h1>', coursepage)
2992 info['title'] = unescapeHTML(m.group(1))
2994 info['title'] = info['id']
2996 m = re.search('<description>([^<]+)</description>', coursepage)
2998 info['description'] = unescapeHTML(m.group(1))
# orderedSet keeps link order while dropping duplicates.
3000 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
3003 'type': 'reference',
3004 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
# Recursively extract every referenced video page.
3008 for entry in info['list']:
3009 assert entry['type'] == 'reference'
3010 results += self.extract(entry['url'])
# Mode 3: the site root — collect CoursePage links and recurse likewise.
3014 'id': 'Stanford OpenClassroom',
3017 'upload_date': None,
3020 self.report_download_webpage(info['id'])
3021 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
3023 rootpage = compat_urllib_request.urlopen(rootURL).read()
3024 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3025 self._downloader.report_error(u'unable to download course info page: ' + compat_str(err))
3028 info['title'] = info['id']
3030 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
3033 'type': 'reference',
3034 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
3039 for entry in info['list']:
3040 assert entry['type'] == 'reference'
3041 results += self.extract(entry['url'])
# Extractor for mtv.com single-video pages: scrapes <meta> tags for song
# name, performer, and the mtvn URI, then downloads a mediaGen XML playlist
# and picks a rendition.
3044 class MTVIE(InfoExtractor):
3045 """Information extractor for MTV.com"""
3047 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
3050 def _real_extract(self, url):
3051 mobj = re.match(self._VALID_URL, url)
3053 self._downloader.report_error(u'invalid URL: %s' % url)
# Normalize scheme-less URLs so the page download below works.
3055 if not mobj.group('proto'):
3056 url = 'http://' + url
3057 video_id = mobj.group('videoid')
3059 webpage = self._download_webpage(url, video_id)
3061 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
3063 self._downloader.report_error(u'unable to extract song name')
# NOTE(review): .decode('iso-8859-1') assumes a Python 2 byte string;
# under Python 3 str has no decode() - confirm against the compat layer.
3065 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3066 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
3068 self._downloader.report_error(u'unable to extract performer')
3070 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3071 video_title = performer + ' - ' + song_name
3073 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
# NOTE(review): this message reads 'unable to mtvn_uri' - looks like the
# word 'extract' is missing (runtime string, left untouched here).
3075 self._downloader.report_error(u'unable to mtvn_uri')
3077 mtvn_uri = mobj.group(1)
3079 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
3081 self._downloader.report_error(u'unable to extract content id')
3083 content_id = mobj.group(1)
3085 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
3086 self.report_extraction(video_id)
3087 request = compat_urllib_request.Request(videogen_url)
3089 metadataXml = compat_urllib_request.urlopen(request).read()
3090 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3091 self._downloader.report_error(u'unable to download video metadata: %s' % compat_str(err))
3094 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
3095 renditions = mdoc.findall('.//rendition')
3097 # For now, always pick the highest quality.
3098 rendition = renditions[-1]
# Format label is "<ext>-<width>x<height>_<bitrate>", derived from the
# rendition's MIME type and attributes.
3101 _,_,ext = rendition.attrib['type'].partition('/')
3102 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
3103 video_url = rendition.find('./src').text
3105 self._downloader.report_error('Invalid rendition field.')
3111 'uploader': performer,
3112 'upload_date': None,
3113 'title': video_title,
# Extractor for v.youku.com.  The site obfuscates its file ids: a seed from
# the playlist JSON drives a small PRNG that shuffles a character alphabet,
# which is then used to decode the real file id.  Videos are served in
# numbered segments, each with its own key.
3121 class YoukuIE(InfoExtractor):
3122 _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
# Session id: millisecond timestamp plus two random numbers, as the
# player's own client-side code generates it.
3125 nowTime = int(time.time() * 1000)
3126 random1 = random.randint(1000,1998)
3127 random2 = random.randint(1000,9999)
3129 return "%d%d%d" %(nowTime,random1,random2)
3131 def _get_file_ID_mix_string(self, seed):
# Linear-congruential shuffle of the alphabet, keyed by 'seed'; each
# step picks (and removes) one character from 'source'.
3133 source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
3135 for i in range(len(source)):
3136 seed = (seed * 211 + 30031 ) % 65536
3137 index = math.floor(seed / 65536 * len(source) )
3138 mixed.append(source[int(index)])
3139 source.remove(source[int(index)])
3140 #return ''.join(mixed)
3143 def _get_file_id(self, fileId, seed):
# The obfuscated id is '*'-separated indices into the mixed alphabet.
3144 mixed = self._get_file_ID_mix_string(seed)
3145 ids = fileId.split('*')
3149 realId.append(mixed[int(ch)])
3150 return ''.join(realId)
3152 def _real_extract(self, url):
3153 mobj = re.match(self._VALID_URL, url)
3155 self._downloader.report_error(u'invalid URL: %s' % url)
3157 video_id = mobj.group('ID')
3159 info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
3161 request = compat_urllib_request.Request(info_url, None, std_headers)
3163 self.report_download_webpage(video_id)
3164 jsondata = compat_urllib_request.urlopen(request).read()
3165 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3166 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
3169 self.report_extraction(video_id)
3171 jsonstr = jsondata.decode('utf-8')
3172 config = json.loads(jsonstr)
3174 video_title = config['data'][0]['title']
3175 seed = config['data'][0]['seed']
# Map the user's requested format onto the formats the site offers.
3177 format = self._downloader.params.get('format', None)
3178 supported_format = list(config['data'][0]['streamfileids'].keys())
3180 if format is None or format == 'best':
3181 if 'hd2' in supported_format:
3186 elif format == 'worst':
3194 fileid = config['data'][0]['streamfileids'][format]
3195 keys = [s['k'] for s in config['data'][0]['segs'][format]]
3196 except (UnicodeDecodeError, ValueError, KeyError):
3197 self._downloader.report_error(u'unable to extract info section')
3201 sid = self._gen_sid()
3202 fileid = self._get_file_id(fileid, seed)
3204 #column 8,9 of fileid represent the segment number
3205 #fileid[7:9] should be changed
3206 for index, key in enumerate(keys):
# Splice the hex segment number into positions 8-9 of the file id.
3208 temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
3209 download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
3212 'id': '%s_part%02d' % (video_id, index),
3213 'url': download_url,
3215 'upload_date': None,
3216 'title': video_title,
3219 files_info.append(info)
# Extractor for video.xnxx.com: the flv URL, title, and thumbnail are all
# scraped from the watch page with the regexes below.
3224 class XNXXIE(InfoExtractor):
3225 """Information extractor for xnxx.com"""
3227 _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
3229 VIDEO_URL_RE = r'flv_url=(.*?)&'
# NOTE(review): the '.' in 'XNXX.COM' is unescaped, so it matches any
# character - harmless in practice but worth tightening.
3230 VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
3231 VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'
3233 def _real_extract(self, url):
3234 mobj = re.match(self._VALID_URL, url)
3236 self._downloader.report_error(u'invalid URL: %s' % url)
3238 video_id = mobj.group(1)
3240 self.report_download_webpage(video_id)
3242 # Get webpage content
3244 webpage_bytes = compat_urllib_request.urlopen(url).read()
3245 webpage = webpage_bytes.decode('utf-8')
3246 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3247 self._downloader.report_error(u'unable to download video webpage: %s' % err)
# The flv URL is percent-encoded inside the player parameters.
3250 result = re.search(self.VIDEO_URL_RE, webpage)
3252 self._downloader.report_error(u'unable to extract video url')
3254 video_url = compat_urllib_parse.unquote(result.group(1))
3256 result = re.search(self.VIDEO_TITLE_RE, webpage)
3258 self._downloader.report_error(u'unable to extract video title')
3260 video_title = result.group(1)
3262 result = re.search(self.VIDEO_THUMB_RE, webpage)
3264 self._downloader.report_error(u'unable to extract video thumbnail')
3266 video_thumbnail = result.group(1)
3272 'upload_date': None,
3273 'title': video_title,
3275 'thumbnail': video_thumbnail,
3276 'description': None,
# Extractor for Google+ post pages: scrapes the post page for metadata,
# follows the embedded photo/video page, then picks the highest-resolution
# video link from the player data.
3280 class GooglePlusIE(InfoExtractor):
3281 """Information extractor for plus.google.com."""
3283 _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
3284 IE_NAME = u'plus.google'
3286 def report_extract_entry(self, url):
3287 """Report the entry URL being downloaded."""
3288 self.to_screen(u'Downloading entry: %s' % url)
3290 def report_date(self, upload_date):
3291 """Report the entry's upload date."""
3292 self.to_screen(u'Entry date: %s' % upload_date)
3294 def report_uploader(self, uploader):
3295 """Report the entry's uploader."""
3296 self.to_screen(u'Uploader: %s' % uploader)
3298 def report_title(self, video_title):
3299 """Report the video title."""
3300 self.to_screen(u'Title: %s' % video_title)
3302 def report_extract_vid_page(self, video_page):
3303 """Report information extraction from the video page."""
3304 self.to_screen(u'Extracting video page: %s' % video_page)
3306 def _real_extract(self, url):
3307 # Extract id from URL
3308 mobj = re.match(self._VALID_URL, url)
3310 self._downloader.report_error(u'Invalid URL: %s' % url)
3313 post_url = mobj.group(0)
3314 video_id = mobj.group(1)
3316 video_extension = 'flv'
3318 # Step 1, Retrieve post webpage to extract further information
3319 self.report_extract_entry(post_url)
3320 request = compat_urllib_request.Request(post_url)
3322 webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3323 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3324 self._downloader.report_error(u'Unable to retrieve entry webpage: %s' % compat_str(err))
3327 # Extract update date
3329 pattern = 'title="Timestamp">(.*?)</a>'
3330 mobj = re.search(pattern, webpage)
3332 upload_date = mobj.group(1)
3333 # Convert timestring to a format suitable for filename
3334 upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3335 upload_date = upload_date.strftime('%Y%m%d')
3336 self.report_date(upload_date)
3340 pattern = r'rel\="author".*?>(.*?)</a>'
3341 mobj = re.search(pattern, webpage)
3343 uploader = mobj.group(1)
3344 self.report_uploader(uploader)
3347 # Get the first line for title
3349 pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3350 mobj = re.search(pattern, webpage)
3352 video_title = mobj.group(1)
3353 self.report_title(video_title)
3355 # Step 2, Stimulate clicking the image box to launch video
3356 pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3357 mobj = re.search(pattern, webpage)
3359 self._downloader.report_error(u'unable to extract video page URL')
3361 video_page = mobj.group(1)
3362 request = compat_urllib_request.Request(video_page)
3364 webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3365 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3366 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
3368 self.report_extract_vid_page(video_page)
3371 # Extract video links on video page
3372 """Extract video links of all sizes"""
3373 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3374 mobj = re.findall(pattern, webpage)
3376 self._downloader.report_error(u'unable to extract video links')
3378 # Sort in resolution
3379 links = sorted(mobj)
3381 # Choose the lowest of the sort, i.e. highest resolution
3382 video_url = links[-1]
3383 # Only get the url. The resolution part in the tuple has no use anymore
3384 video_url = video_url[-1]
3385 # Treat escaped \u0026 style hex
# Python 2 strings have .decode(); the except branch covers Python 3.
3387 video_url = video_url.decode("unicode_escape")
3388 except AttributeError: # Python 3
3389 video_url = bytes(video_url, 'ascii').decode('unicode-escape')
3395 'uploader': uploader,
3396 'upload_date': upload_date,
3397 'title': video_title,
3398 'ext': video_extension,
# Extractor for nba.com video pages: the mp4 URL is built directly from the
# path-derived video id; title/date/description are scraped from the page.
3401 class NBAIE(InfoExtractor):
3402 _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
3405 def _real_extract(self, url):
3406 mobj = re.match(self._VALID_URL, url)
3408 self._downloader.report_error(u'invalid URL: %s' % url)
3411 video_id = mobj.group(1)
# Strip a trailing /index.html so the id maps onto the CDN path.
3412 if video_id.endswith('/index.html'):
3413 video_id = video_id[:-len('/index.html')]
3415 webpage = self._download_webpage(url, video_id)
3417 video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
# Small helper: first regex group from the page, or 'default'.
3418 def _findProp(rexp, default=None):
3419 m = re.search(rexp, webpage)
3421 return unescapeHTML(m.group(1))
3425 shortened_video_id = video_id.rpartition('/')[2]
3426 title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
3428 'id': shortened_video_id,
# NOTE(review): 'uploader_date' looks like a typo for 'upload_date'
# (the documented optional field name) - confirm and fix separately.
3432 'uploader_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
3433 'description': _findProp(r'<div class="description">(.*?)</h1>'),
# Extractor for justin.tv / twitch.tv.  A channel URL is paged through the
# Justin.tv REST API in chunks of _JUSTIN_PAGE_LIMIT; a /b/ URL fetches a
# single broadcast archive.
3437 class JustinTVIE(InfoExtractor):
3438 """Information extractor for justin.tv and twitch.tv"""
3439 # TODO: One broadcast may be split into multiple videos. The key
3440 # 'broadcast_id' is the same for all parts, and 'broadcast_part'
3441 # starts at 1 and increases. Can we treat all parts as one video?
3443 _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
3444 ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
3445 _JUSTIN_PAGE_LIMIT = 100
3446 IE_NAME = u'justin.tv'
3448 def report_download_page(self, channel, offset):
3449 """Report attempt to download a single page of videos."""
3450 self.to_screen(u'%s: Downloading video information from %d to %d' %
3451 (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
3453 # Return count of items, list of *valid* items
3454 def _parse_page(self, url):
3456 urlh = compat_urllib_request.urlopen(url)
3457 webpage_bytes = urlh.read()
3458 webpage = webpage_bytes.decode('utf-8', 'ignore')
3459 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3460 self._downloader.report_error(u'unable to download video info JSON: %s' % compat_str(err))
3463 response = json.loads(webpage)
# The API signals errors by returning an object instead of a list.
3464 if type(response) != list:
3465 error_text = response.get('error', 'unknown error')
3466 self._downloader.report_error(u'Justin.tv API: %s' % error_text)
3469 for clip in response:
3470 video_url = clip['video_file_url']
3472 video_extension = os.path.splitext(video_url)[1][1:]
# start_time is ISO-ish; keep YYYYMMDD by dropping the dashes.
3473 video_date = re.sub('-', '', clip['start_time'][:10])
3474 video_uploader_id = clip.get('user_id', clip.get('channel_id'))
3475 video_id = clip['id']
3476 video_title = clip.get('title', video_id)
3480 'title': video_title,
3481 'uploader': clip.get('channel_name', video_uploader_id),
3482 'uploader_id': video_uploader_id,
3483 'upload_date': video_date,
3484 'ext': video_extension,
3486 return (len(response), info)
3488 def _real_extract(self, url):
3489 mobj = re.match(self._VALID_URL, url)
3491 self._downloader.report_error(u'invalid URL: %s' % url)
3494 api = 'http://api.justin.tv'
3495 video_id = mobj.group(mobj.lastindex)
# lastindex == 1 means only the channel group matched (no /b/ part).
3497 if mobj.lastindex == 1:
3499 api += '/channel/archives/%s.json'
3501 api += '/broadcast/by_archive/%s.json'
3502 api = api % (video_id,)
3504 self.report_extraction(video_id)
3508 limit = self._JUSTIN_PAGE_LIMIT
3511 self.report_download_page(video_id, offset)
3512 page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
3513 page_count, page_info = self._parse_page(page_url)
3514 info.extend(page_info)
# A short page means we reached the end of the channel archive.
3515 if not paged or page_count != limit:
# Extractor for funnyordie.com: video URL comes from the <video>/<source>
# markup; title falls back from the player <h1> to the page <title>.
3520 class FunnyOrDieIE(InfoExtractor):
3521 _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
3523 def _real_extract(self, url):
3524 mobj = re.match(self._VALID_URL, url)
3526 self._downloader.report_error(u'invalid URL: %s' % url)
3529 video_id = mobj.group('id')
3530 webpage = self._download_webpage(url, video_id)
3532 m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
3534 self._downloader.report_error(u'unable to find video information')
3535 video_url = unescapeHTML(m.group('url'))
3537 m = re.search(r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", webpage, flags=re.DOTALL)
# Fallback title source when the player heading is absent.
3539 m = re.search(r'<title>(?P<title>[^<]+?)</title>', webpage)
3541 self._downloader.report_error(u'Cannot find video title')
3542 title = clean_html(m.group('title'))
3544 m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
3546 desc = unescapeHTML(m.group('desc'))
3555 'description': desc,
# Extractor for Steam store video/app pages.  Uses an age-gate bypass URL,
# then zips together the movie entries, their titles, and thumbnails found
# on the page into a playlist result.
3559 class SteamIE(InfoExtractor):
3560 _VALID_URL = r"""http://store.steampowered.com/
3561 (?P<urltype>video|app)/ #If the page is only for videos or for a game
3563 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
# suitable() is overridden because _VALID_URL is written with re.VERBOSE.
3567 def suitable(cls, url):
3568 """Receives a URL and returns True if suitable for this IE."""
3569 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
3571 def _real_extract(self, url):
3572 m = re.match(self._VALID_URL, url, re.VERBOSE)
3573 gameID = m.group('gameID')
# Fixed birthdate query parameters skip the store's age check.
3574 videourl = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970' % gameID
3575 self.report_age_confirmation()
3576 webpage = self._download_webpage(videourl, gameID)
3577 game_title = re.search(r'<h2 class="pageheader">(?P<game_title>.*?)</h2>', webpage).group('game_title')
3579 urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
3580 mweb = re.finditer(urlRE, webpage)
3581 namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
3582 titles = re.finditer(namesRE, webpage)
3583 thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
3584 thumbs = re.finditer(thumbsRE, webpage)
# NOTE(review): zip() pairs the three iterators positionally - assumes
# they appear in the same order and count on the page; verify.
3586 for vid,vtitle,thumb in zip(mweb,titles,thumbs):
3587 video_id = vid.group('videoID')
3588 title = vtitle.group('videoName')
3589 video_url = vid.group('videoURL')
3590 video_thumb = thumb.group('thumbnail')
3592 self._downloader.report_error(u'Cannot find video url for %s' % video_id)
3597 'title': unescapeHTML(title),
3598 'thumbnail': video_thumb
3601 return [self.playlist_result(videos, gameID, game_title)]
# Extractor for ustream.tv recorded videos: the flv URL is derived directly
# from the video id; title and uploader are scraped from the page markup.
3603 class UstreamIE(InfoExtractor):
3604 _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
3605 IE_NAME = u'ustream'
3607 def _real_extract(self, url):
3608 m = re.match(self._VALID_URL, url)
3609 video_id = m.group('videoID')
3610 video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
3611 webpage = self._download_webpage(url, video_id)
3612 m = re.search(r'data-title="(?P<title>.+)"',webpage)
3613 title = m.group('title')
3614 m = re.search(r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',webpage)
3615 uploader = m.group('uploader')
3621 'uploader': uploader
# Extractor for worldstarhiphop.com (and the 'candy' variant): the media
# URL is grepped straight out of the page source; the title comes from
# <title>, with a candy-specific override when a thumbnail is missing.
3625 class WorldStarHipHopIE(InfoExtractor):
3626 _VALID_URL = r'http://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
3627 IE_NAME = u'WorldStarHipHop'
3629 def _real_extract(self, url):
3630 _src_url = r"""(http://hw-videos.*(?:mp4|flv))"""
3632 webpage_src = compat_urllib_request.urlopen(url).read()
3633 webpage_src = webpage_src.decode('utf-8')
3635 mobj = re.search(_src_url, webpage_src)
3637 m = re.match(self._VALID_URL, url)
3638 video_id = m.group('id')
3640 if mobj is not None:
3641 video_url = mobj.group()
# Extension is chosen from the matched media URL's suffix.
3642 if 'mp4' in video_url:
3647 self._downloader.report_error(u'Cannot find video url for %s' % video_id)
3650 _title = r"""<title>(.*)</title>"""
3652 mobj = re.search(_title, webpage_src)
3654 if mobj is not None:
3655 title = mobj.group(1)
# NOTE(review): 'World Start Hip Hop' looks like a typo for
# 'World Star Hip Hop' (runtime string, left untouched here).
3657 title = 'World Start Hip Hop - %s' % time.ctime()
3659 _thumbnail = r"""rel="image_src" href="(.*)" />"""
3660 mobj = re.search(_thumbnail, webpage_src)
3662 # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
3663 if mobj is not None:
3664 thumbnail = mobj.group(1)
3666 _title = r"""candytitles.*>(.*)</span>"""
3667 mobj = re.search(_title, webpage_src)
3668 if mobj is not None:
3669 title = mobj.group(1)
3676 'thumbnail' : thumbnail,
# Extractor for rbmaradio.com shows: metadata is embedded as JSON in an
# inline <script> (window.gon), and the stream URL is the akamai_url with a
# fixed 256 kbps parameter appended.
3681 class RBMARadioIE(InfoExtractor):
3682 _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'
3684 def _real_extract(self, url):
3685 m = re.match(self._VALID_URL, url)
3686 video_id = m.group('videoID')
3688 webpage = self._download_webpage(url, video_id)
3689 m = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
3691 raise ExtractorError(u'Cannot find metadata')
3692 json_data = m.group(1)
3695 data = json.loads(json_data)
3696 except ValueError as e:
3697 raise ExtractorError(u'Invalid JSON: ' + str(e))
3699 video_url = data['akamai_url'] + '&cbr=256'
# File extension comes from the URL path's suffix.
3700 url_parts = compat_urllib_parse_urlparse(video_url)
3701 video_ext = url_parts.path.rpartition('.')[2]
3706 'title': data['title'],
3707 'description': data.get('teaser_text'),
3708 'location': data.get('country_of_origin'),
3709 'uploader': data.get('host', {}).get('name'),
3710 'uploader_id': data.get('host', {}).get('slug'),
3711 'thumbnail': data.get('image', {}).get('large_url_2x'),
3712 'duration': data.get('duration'),
# Extractor for youporn.com: parses the download list on the watch page
# into one format entry per link, then honours --format / --list-formats
# from the downloader params.
3717 class YouPornIE(InfoExtractor):
3718 """Information extractor for youporn.com."""
3719 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
3721 def _print_formats(self, formats):
3722 """Print all available formats"""
3723 print(u'Available formats:')
3724 print(u'ext\t\tformat')
3725 print(u'---------------------------------')
3726 for format in formats:
3727 print(u'%s\t\t%s' % (format['ext'], format['format']))
# Return the entry whose 'format' equals the requested format string.
3729 def _specific(self, req_format, formats):
3731 if(x["format"]==req_format):
3735 def _real_extract(self, url):
3736 mobj = re.match(self._VALID_URL, url)
3738 self._downloader.report_error(u'invalid URL: %s' % url)
3741 video_id = mobj.group('videoid')
# The age_verified cookie skips the site's age gate.
3743 req = compat_urllib_request.Request(url)
3744 req.add_header('Cookie', 'age_verified=1')
3745 webpage = self._download_webpage(req, video_id)
3747 # Get the video title
3748 result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
3750 raise ExtractorError(u'Unable to extract video title')
3751 video_title = result.group('title').strip()
3753 # Get the video date
3754 result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
3756 self._downloader.report_warning(u'unable to extract video date')
3759 upload_date = result.group('date').strip()
3761 # Get the video uploader
3762 result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
3764 self._downloader.report_warning(u'unable to extract uploader')
3765 video_uploader = None
3767 video_uploader = result.group('uploader').strip()
3768 video_uploader = clean_html( video_uploader )
3770 # Get all of the formats available
3771 DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
3772 result = re.search(DOWNLOAD_LIST_RE, webpage)
3774 raise ExtractorError(u'Unable to extract download list')
3775 download_list_html = result.group('download_list').strip()
3777 # Get all of the links from the page
3778 LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
3779 links = re.findall(LINK_RE, download_list_html)
3780 if(len(links) == 0):
3781 raise ExtractorError(u'ERROR: no known formats available for video')
3783 self.to_screen(u'Links found: %d' % len(links))
3788 # A link looks like this:
3789 # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
3790 # A path looks like this:
3791 # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
3792 video_url = unescapeHTML( link )
3793 path = compat_urllib_parse_urlparse( video_url ).path
3794 extension = os.path.splitext( path )[1][1:]
# Size and bitrate are encoded in the 5th path component ("480p_370k").
3795 format = path.split('/')[4].split('_')[:2]
3798 format = "-".join( format )
3799 title = u'%s-%s-%s' % (video_title, size, bitrate)
3804 'uploader': video_uploader,
3805 'upload_date': upload_date,
3810 'description': None,
3814 if self._downloader.params.get('listformats', None):
3815 self._print_formats(formats)
3818 req_format = self._downloader.params.get('format', None)
3819 self.to_screen(u'Format: %s' % req_format)
# Formats are listed best-first, so [-1] is the worst quality.
3821 if req_format is None or req_format == 'best':
3823 elif req_format == 'worst':
3824 return [formats[-1]]
3825 elif req_format in ('-1', 'all'):
3828 format = self._specific( req_format, formats )
3830 self._downloader.report_error(u'requested format not available')
# Extractor for pornotube.com: flv URL and upload date are scraped from the
# watch page; the title is taken from the URL itself.
3836 class PornotubeIE(InfoExtractor):
3837 """Information extractor for pornotube.com."""
3838 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
3840 def _real_extract(self, url):
3841 mobj = re.match(self._VALID_URL, url)
3843 self._downloader.report_error(u'invalid URL: %s' % url)
3846 video_id = mobj.group('videoid')
# Title comes straight from the URL's trailing path segment.
3847 video_title = mobj.group('title')
3849 # Get webpage content
3850 webpage = self._download_webpage(url, video_id)
3853 VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
3854 result = re.search(VIDEO_URL_RE, webpage)
3856 self._downloader.report_error(u'unable to extract video url')
3858 video_url = compat_urllib_parse.unquote(result.group('url'))
3860 #Get the uploaded date
3861 VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
3862 result = re.search(VIDEO_UPLOADED_RE, webpage)
# NOTE(review): this error message says 'video title' but the failed
# lookup above is the upload date - message is misleading.
3864 self._downloader.report_error(u'unable to extract video title')
3866 upload_date = result.group('date')
3868 info = {'id': video_id,
3871 'upload_date': upload_date,
3872 'title': video_title,
# Extractor for youjizz.com: locates the embed page referenced by the watch
# page, then pulls the actual media URL out of the embed player setup code.
3878 class YouJizzIE(InfoExtractor):
3879 """Information extractor for youjizz.com."""
3880 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'
3882 def _real_extract(self, url):
3883 mobj = re.match(self._VALID_URL, url)
3885 self._downloader.report_error(u'invalid URL: %s' % url)
3888 video_id = mobj.group('videoid')
3890 # Get webpage content
3891 webpage = self._download_webpage(url, video_id)
3893 # Get the video title
3894 result = re.search(r'<title>(?P<title>.*)</title>', webpage)
3896 raise ExtractorError(u'ERROR: unable to extract video title')
3897 video_title = result.group('title').strip()
3899 # Get the embed page
3900 result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
3902 raise ExtractorError(u'ERROR: unable to extract embed page')
# The numeric embed id replaces the slug-style id from the watch URL.
3904 embed_page_url = result.group(0).strip()
3905 video_id = result.group('videoid')
3907 webpage = self._download_webpage(embed_page_url, video_id)
3910 result = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
3912 raise ExtractorError(u'ERROR: unable to extract video url')
3913 video_url = result.group('source')
3915 info = {'id': video_id,
3917 'title': video_title,
3920 'player_url': embed_page_url}
# Extractor for 8tracks.com mixes: reads the PAGE.mix JSON from the page,
# then walks the play/next API with a random session id, yielding one track
# per iteration until at_last_track is set.
3924 class EightTracksIE(InfoExtractor):
3926 _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'
3928 def _real_extract(self, url):
3929 mobj = re.match(self._VALID_URL, url)
3931 raise ExtractorError(u'Invalid URL: %s' % url)
3932 playlist_id = mobj.group('id')
3934 webpage = self._download_webpage(url, playlist_id)
3936 m = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
3938 raise ExtractorError(u'Cannot find trax information')
3939 json_like = m.group(1)
3940 data = json.loads(json_like)
# Random session id, as the site's own player generates per playback.
3942 session = str(random.randint(0, 1000000000))
3944 track_count = data['tracks_count']
# NOTE(review): mix_id is used here and below; its assignment (likely
# from the mix JSON) is not visible in this chunk - confirm it is set.
3945 first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
3946 next_url = first_url
3948 for i in itertools.count():
3949 api_json = self._download_webpage(next_url, playlist_id,
3950 note=u'Downloading song information %s/%s' % (str(i+1), track_count),
3951 errnote=u'Failed to download song information')
3952 api_data = json.loads(api_json)
3953 track_data = api_data[u'set']['track']
3955 'id': track_data['id'],
3956 'url': track_data['track_file_stream_url'],
3957 'title': track_data['performer'] + u' - ' + track_data['name'],
3958 'raw_title': track_data['name'],
3959 'uploader_id': data['user']['login'],
# Stop once the API reports the final track of the set.
3963 if api_data['set']['at_last_track']:
3965 next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
# Extractor for keek.com: media and thumbnail URLs are built directly from
# the video id; title and uploader are scraped from the page markup.
3968 class KeekIE(InfoExtractor):
3969 _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
3972 def _real_extract(self, url):
3973 m = re.match(self._VALID_URL, url)
3974 video_id = m.group('videoID')
3975 video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
3976 thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
3977 webpage = self._download_webpage(url, video_id)
3978 m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
3979 title = unescapeHTML(m.group('title'))
3980 m = re.search(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>', webpage)
3981 uploader = clean_html(m.group('uploader'))
3987 'thumbnail': thumbnail,
3988 'uploader': uploader
# Extractor for ted.com talks and playlists.  Playlist pages are expanded
# into per-talk reference entries; single talks resolve a mediaSlug from the
# inline talkDetails script into a direct download URL.
3992 class TEDIE(InfoExtractor):
3993 _VALID_URL=r'''http://www.ted.com/
3995 ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
3997 ((?P<type_talk>talks)) # We have a simple talk
3999 /(?P<name>\w+) # Here goes the name and then ".html"
# suitable() is overridden because _VALID_URL is written with re.VERBOSE.
4003 def suitable(cls, url):
4004 """Receives a URL and returns True if suitable for this IE."""
4005 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
4007 def _real_extract(self, url):
4008 m=re.match(self._VALID_URL, url, re.VERBOSE)
4009 if m.group('type_talk'):
4010 return [self._talk_info(url)]
4012 playlist_id=m.group('playlist_id')
4013 name=m.group('name')
4014 self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
4015 return [self._playlist_videos_info(url,name,playlist_id)]
4017 def _talk_video_link(self,mediaSlug):
4018 '''Returns the video link for that mediaSlug'''
4019 return 'http://download.ted.com/talks/%s.mp4' % mediaSlug
4021 def _playlist_videos_info(self,url,name,playlist_id=0):
4022 '''Returns the videos of the playlist'''
4024 <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
4025 ([.\s]*?)data-playlist_item_id="(\d+)"
4026 ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
4028 video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
4029 webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
4030 m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
4031 m_names=re.finditer(video_name_RE,webpage)
4033 playlist_RE = r'div class="headline">(\s*?)<h1>(\s*?)<span>(?P<playlist_title>.*?)</span>'
4034 m_playlist = re.search(playlist_RE, webpage)
4035 playlist_title = m_playlist.group('playlist_title')
# Each talk becomes a reference entry to be resolved by this same IE.
4037 playlist_entries = []
4038 for m_video, m_name in zip(m_videos,m_names):
4039 video_id=m_video.group('video_id')
4040 talk_url='http://www.ted.com%s' % m_name.group('talk_url')
4041 playlist_entries.append(self.url_result(talk_url, 'TED'))
4042 return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)
4044 def _talk_info(self, url, video_id=0):
4045 """Return the video for the talk in the url"""
4046 m=re.match(self._VALID_URL, url,re.VERBOSE)
4047 videoName=m.group('name')
4048 webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
4049 # If the url includes the language we get the title translated
4050 title_RE=r'<span id="altHeadline" >(?P<title>.*)</span>'
4051 title=re.search(title_RE, webpage).group('title')
4052 info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
4053 "id":(?P<videoID>[\d]+).*?
4054 "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
4055 thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
4056 thumb_match=re.search(thumb_RE,webpage)
4057 info_match=re.search(info_RE,webpage,re.VERBOSE)
4058 video_id=info_match.group('videoID')
4059 mediaSlug=info_match.group('mediaSlug')
4060 video_url=self._talk_video_link(mediaSlug)
4066 'thumbnail': thumb_match.group('thumbnail')
# Extractor for myspass.de: the video id is taken from the URL path, then a
# per-video metadata XML endpoint supplies URL, title, format, description,
# and thumbnail.
4070 class MySpassIE(InfoExtractor):
4071 _VALID_URL = r'http://www.myspass.de/.*'
4073 def _real_extract(self, url):
4074 META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'
4076 # video id is the last path element of the URL
4077 # usually there is a trailing slash, so also try the second but last
4078 url_path = compat_urllib_parse_urlparse(url).path
4079 url_parent_path, video_id = os.path.split(url_path)
4081 _, video_id = os.path.split(url_parent_path)
4084 metadata_url = META_DATA_URL_TEMPLATE % video_id
4085 metadata_text = self._download_webpage(metadata_url, video_id)
4086 metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))
4088 # extract values from metadata
# <url_flv> and <title> are mandatory; the rest are optional extras.
4089 url_flv_el = metadata.find('url_flv')
4090 if url_flv_el is None:
4091 self._downloader.report_error(u'unable to extract download url')
4093 video_url = url_flv_el.text
4094 extension = os.path.splitext(video_url)[1][1:]
4095 title_el = metadata.find('title')
4096 if title_el is None:
4097 self._downloader.report_error(u'unable to extract title')
4099 title = title_el.text
4100 format_id_el = metadata.find('format_id')
4101 if format_id_el is None:
4104 format = format_id_el.text
4105 description_el = metadata.find('description')
4106 if description_el is not None:
4107 description = description_el.text
4110 imagePreview_el = metadata.find('imagePreview')
4111 if imagePreview_el is not None:
4112 thumbnail = imagePreview_el.text
4121 'thumbnail': thumbnail,
4122 'description': description
# Extractor for spiegel.de videos: title comes from the HTML page, stream
# data from a per-video XML document on video2.spiegel.de.
# NOTE(review): this paste is mangled — each line carries its original file
# line number, and original lines 4135 (the missing-title guard) and
# 4150-4153 / 4156-4158 (the info dict and return) are absent; recover the
# full text from version control before editing logic.
4126 class SpiegelIE(InfoExtractor):
4127 _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'
4129 def _real_extract(self, url):
4130 m = re.match(self._VALID_URL, url)
4131 video_id = m.group('videoID')
4133 webpage = self._download_webpage(url, video_id)
4134 m = re.search(r'<div class="spVideoTitle">(.*?)</div>', webpage)
# (the `if ...:` guard selecting this error path is missing from the paste)
4136 raise ExtractorError(u'Cannot find title')
4137 video_title = unescapeHTML(m.group(1))
# Stream metadata lives in an XML file keyed by the numeric video id.
4139 xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
4140 xml_code = self._download_webpage(xml_url, video_id,
4141 note=u'Downloading XML', errnote=u'Failed to download XML')
4143 idoc = xml.etree.ElementTree.fromstring(xml_code)
# Take the last child element of the document root and read its filename
# and duration; presumably the last entry is the preferred variant — confirm.
4144 last_type = idoc[-1]
4145 filename = last_type.findall('./filename')[0].text
4146 duration = float(last_type.findall('./duration')[0].text)
4148 video_url = 'http://video2.spiegel.de/flash/' + filename
# Extension = text after the final '.' in the filename.
4149 video_ext = filename.rpartition('.')[2]
# Truncated: the info dict / return statement is missing from the paste.
4154 'title': video_title,
4155 'duration': duration,
# Extractor for liveleak.com view pages; scrapes the player config and the
# OpenGraph meta tags.
# NOTE(review): this paste is mangled — each line carries its original file
# line number, and the guard/else lines between each `re.search` and its
# `.group(...)` use, plus the final info dict/return (orig 4194-4207), are
# missing; recover the full text from version control before editing logic.
4159 class LiveLeakIE(InfoExtractor):
4161 _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
4162 IE_NAME = u'liveleak'
4164 def _real_extract(self, url):
4165 mobj = re.match(self._VALID_URL, url)
# (match-failure guard missing from the paste)
4167 self._downloader.report_error(u'invalid URL: %s' % url)
4170 video_id = mobj.group('video_id')
4172 webpage = self._download_webpage(url, video_id)
# Direct media URL comes from the player's `file: "..."` config entry.
4174 m = re.search(r'file: "(.*?)",', webpage)
4176 self._downloader.report_error(u'unable to find video url')
4178 video_url = m.group(1)
4180 m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
4182 self._downloader.report_error(u'Cannot find video title')
# Strip the site's "LiveLeak.com -" prefix from the og:title value.
4183 title = unescapeHTML(m.group('title')).replace('LiveLeak.com -', '').strip()
4185 m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
4187 desc = unescapeHTML(m.group('desc'))
# Uploader name scraped from the "By:" byline link.
4191 m = re.search(r'By:.*?(\w+)</a>', webpage)
4193 uploader = clean_html(m.group(1))
# Truncated: the start of the returned info dict is missing from the paste.
4202 'description': desc,
4203 'uploader': uploader
# Extractor for the ARD Mediathek / daserste.de; parses the page's
# mediaCollection.addMediaStream(...) calls and picks the best stream.
# NOTE(review): this paste is mangled — each line carries its original file
# line number, and the branch lines (orig 4216/4218/4220, 4227, 4230-4231,
# 4243) and the final return (orig 4246-4247) are missing; recover the full
# text from version control before editing logic.
4208 class ARDIE(InfoExtractor):
4209 _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
4210 _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
4211 _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'
4213 def _real_extract(self, url):
4214 # determine video id from url
4215 m = re.match(self._VALID_URL, url)
# Prefer a numeric documentId= query parameter when present; otherwise fall
# back to the last path segment (the if/else lines are missing from the paste).
4217 numid = re.search(r'documentId=([0-9]+)', url)
4219 video_id = numid.group(1)
4221 video_id = m.group('video_id')
4223 # determine title and media streams from webpage
4224 html = self._download_webpage(url, video_id)
4225 title = re.search(self._TITLE, html).group('title')
4226 streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
# No streams found: the page carries an "fsk" (age-rating) marker and the
# video is night-time only (the enclosing guard is missing from the paste).
4228 assert '"fsk"' in html
4229 self._downloader.report_error(u'this video is only available after 8:00 pm')
4232 # choose default media type and highest quality for now
4233 stream = max([s for s in streams if int(s["media_type"]) == 0],
4234 key=lambda s: int(s["quality"]))
4236 # there's two possibilities: RTMP stream or HTTP download
4237 info = {'id': video_id, 'title': title, 'ext': 'mp4'}
4238 if stream['rtmp_url']:
4239 self.to_screen(u'RTMP download detected')
4240 assert stream['video_url'].startswith('mp4:')
4241 info["url"] = stream["rtmp_url"]
4242 info["play_path"] = stream['video_url']
# (the `else:` line for the plain-HTTP case is missing from the paste)
4244 assert stream["video_url"].endswith('.mp4')
4245 info["url"] = stream["video_url"]
# Extractor for Tumblr post/video pages; the media URL is embedded in the
# page as \x22-escaped markup.
# NOTE(review): this paste is mangled — each line carries its original file
# line number, and the no-video guard (orig 4261/4263) plus most of the
# returned dict (orig 4276-4282) are missing; recover the full text from
# version control before editing logic.  Also note the user-facing message
# "No video founded" has a typo ("found") — fix in a behavior change, not here.
4248 class TumblrIE(InfoExtractor):
4249 _VALID_URL = r'http://(?P<blog_name>.*?).tumblr.com/((post)|(video))/(?P<id>\d*)/(.*?)'
4251 def _real_extract(self, url):
4252 m_url = re.match(self._VALID_URL, url)
4253 video_id = m_url.group('id')
4254 blog = m_url.group('blog_name')
# Normalize to the canonical post URL before downloading.
4256 url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
4257 webpage = self._download_webpage(url, video_id)
# The video element is embedded with \x22-escaped quotes, hence the \\x22
# tokens in the pattern.
4259 re_video = r'src=\\x22(?P<video_url>http://%s.tumblr.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
4260 video = re.search(re_video, webpage)
# (the `if not video:` guard for this message is missing from the paste)
4262 self.to_screen("No video founded")
4264 video_url = video.group('video_url')
4265 ext = video.group('ext')
4267 re_thumb = r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22' # We pick the first poster
4268 thumb = re.search(re_thumb, webpage).group('thumb').replace('\\', '')
4270 # The only place where you can get a title, it's not complete,
4271 # but searching in other places doesn't work for all videos
4272 re_title = r'<title>(.*?) - (?P<title>.*?)</title>'
4273 title = unescapeHTML(re.search(re_title, webpage).group('title'))
# Truncated: the remainder of the returned dict is missing from the paste.
4275 return [{'id': video_id,
# Registry of every supported extractor, in matching-priority order.
# NOTE(review): this paste is heavily truncated — only three of the many
# list entries survive, and the docstring terminator, the `return [` line
# and the closing bracket are all missing; recover the full list from
# version control before editing.
4283 def gen_extractors():
4284 """ Return a list of an instance of every supported extractor.
4285 The order does matter; the first extractor matched is the one handling the URL.
4288 YoutubePlaylistIE(),
4313 StanfordOpenClassroomIE(),
4323 WorldStarHipHopIE(),
def get_info_extractor(ie_name):
    """Look up and return the extractor class named ``<ie_name>IE``.

    For example, ``get_info_extractor('Youtube')`` resolves to the
    module-level ``YoutubeIE`` class.  Raises ``KeyError`` if no such
    class exists in this module.
    """
    return globals()['%sIE' % ie_name]