2 # -*- coding: utf-8 -*-
4 from __future__ import absolute_import
15 import xml.etree.ElementTree
23 class InfoExtractor(object):
24 """Information Extractor class.
26 Information extractors are the classes that, given a URL, extract
27 information about the video (or videos) the URL refers to. This
28 information includes the real video URL, the video title, author and
29 others. The information is stored in a dictionary which is then
30 passed to the FileDownloader. The FileDownloader processes this
31 information possibly downloading the video to the file system, among
32 other possible outcomes.
34 The dictionaries must include the following fields:
38 title: Video title, unescaped.
39 ext: Video filename extension.
41 The following fields are optional:
43 format: The video format, defaults to ext (used for --get-format)
44 thumbnail: Full URL to a video thumbnail image.
45 description: One-line video description.
46 uploader: Full name of the video uploader.
47 upload_date: Video upload date (YYYYMMDD).
48 uploader_id: Nickname or id of the video uploader.
49 location: Physical location of the video.
50 player_url: SWF Player URL (used for rtmpdump).
51 subtitles: The subtitle file contents.
52 urlhandle: [internal] The urlHandle to be used to download the file,
53 like returned by urllib.request.urlopen
55 The fields should all be Unicode strings.
57 Subclasses of this one should re-define the _real_initialize() and
58 _real_extract() methods and define a _VALID_URL regexp.
59 Probably, they should also be added to the list of extractors.
61 _real_extract() must return a *list* of information dictionaries as
64 Finally, the _WORKING attribute should be set to False for broken IEs
65 in order to warn the users and skip the tests.
72 def __init__(self, downloader=None):
73 """Constructor. Receives an optional downloader."""
75 self.set_downloader(downloader)
def suitable(cls, url):
    """Return True when *url* matches this extractor's _VALID_URL pattern."""
    return bool(re.match(cls._VALID_URL, url))
84 """Getter method for _WORKING."""
88 """Initializes an instance (authentication, etc)."""
90 self._real_initialize()
def extract(self, url):
    """Extract URL information and return it as a list of dicts.

    Runs the lazy one-time initialization step (authentication, age
    confirmation, etc. — see initialize()) before delegating to the
    subclass hook _real_extract().
    """
    # The elided listing jumped straight to _real_extract(); without this
    # call login/age-gate handling would never run.
    self.initialize()
    return self._real_extract(url)
def set_downloader(self, downloader):
    """Attach *downloader* (a FileDownloader instance) to this extractor."""
    self._downloader = downloader
102 def _real_initialize(self):
103 """Real initialization process. Redefine in subclasses."""
106 def _real_extract(self, url):
107 """Real extraction process. Redefine in subclasses."""
112 return type(self).__name__[:-2]
def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
    """ Returns the response handle.

    note=None prints the default download message, note=False is silent,
    any other note is printed as '<video_id>: <note>'.  Network failures
    are re-raised as ExtractorError with *errnote* as the message.
    """
    # Restored the if/try headers the elided listing dropped (the orphaned
    # elif/except made the intended structure unambiguous).
    if note is None:
        self.report_download_webpage(video_id)
    elif note is not False:
        self.to_screen(u'%s: %s' % (video_id, note))
    try:
        return compat_urllib_request.urlopen(url_or_request)
    except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        if errnote is None:
            errnote = u'Unable to download webpage'
        raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])
127 def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
128 """ Returns the data of the page as a string """
129 urlh = self._request_webpage(url_or_request, video_id, note, errnote)
130 content_type = urlh.headers.get('Content-Type', '')
131 m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
133 encoding = m.group(1)
136 webpage_bytes = urlh.read()
137 if self._downloader.params.get('dump_intermediate_pages', False):
139 url = url_or_request.get_full_url()
140 except AttributeError:
142 self.to_screen(u'Dumping request to ' + url)
143 dump = base64.b64encode(webpage_bytes).decode('ascii')
144 self._downloader.to_screen(dump)
145 return webpage_bytes.decode(encoding, 'replace')
def to_screen(self, msg):
    """Print *msg* to the screen, prefixed with this IE's name in brackets."""
    prefixed = u'[%s] %s' % (self.IE_NAME, msg)
    self._downloader.to_screen(prefixed)
def report_extraction(self, id_or_name):
    """Announce that information extraction has started."""
    message = u'%s: Extracting information' % id_or_name
    self.to_screen(message)
def report_download_webpage(self, video_id):
    """Announce that the video webpage is being downloaded."""
    message = u'%s: Downloading webpage' % video_id
    self.to_screen(message)
def report_age_confirmation(self):
    """Announce the attempt to confirm the user's age."""
    message = u'Confirming age'
    self.to_screen(message)
# Methods for following issue #608.
# They set the correct value of the '_type' key.
def video_result(self, video_info):
    """Tag *video_info* as a plain video result and return it."""
    video_info['_type'] = 'video'
    # Restored: without the return the caller would receive None.
    return video_info
def url_result(self, url, ie=None):
    """Return a result dict pointing at *url*, to be processed later.

    #TODO: ie should be the class used for getting the info
    """
    video_info = {'_type': 'url',
                  'url': url,
                  'ie_key': ie}
    return video_info
def playlist_result(self, entries, playlist_id=None, playlist_title=None):
    """Return a playlist result dict wrapping *entries*.

    id/title keys are only set when the corresponding argument is truthy.
    """
    video_info = {'_type': 'playlist',
                  'entries': entries}
    if playlist_id:
        video_info['id'] = playlist_id
    if playlist_title:
        video_info['title'] = playlist_title
    return video_info
187 class YoutubeIE(InfoExtractor):
188 """Information extractor for youtube.com."""
192 (?:https?://)? # http(s):// (optional)
193 (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
194 tube\.majestyc\.net/) # the various hostnames, with wildcard subdomains
195 (?:.*?\#/)? # handle anchor (#/) redirect urls
196 (?: # the various things that can precede the ID:
197 (?:(?:v|embed|e)/) # v/ or embed/ or e/
198 |(?: # or the v= param in all its forms
199 (?:watch(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
200 (?:\?|\#!?) # the params delimiter ? or # or #!
201 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
204 )? # optional -> youtube.com/xxxx is OK
205 )? # all until now is optional -> you can pass the naked ID
206 ([0-9A-Za-z_-]+) # here is it! the YouTube video ID
207 (?(1).+)? # if we found the ID, everything can follow
# Forces the YouTube UI into English so the scraped markup is stable.
_LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
# Google account login endpoint.
_LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
# Age-gate confirmation endpoint.
_AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
# Captures the target of a next_url= redirect parameter.
_NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
# Machine name used when looking up credentials in ~/.netrc.
_NETRC_MACHINE = 'youtube'
# Listed in order of quality
# (entries are YouTube itag codes; presumably best-first — see
# _video_dimensions for the corresponding resolutions)
_available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
# Same codes, reordered to prefer patent-free (WebM) formats.
_available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
217 _video_extensions = {
223 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
229 _video_dimensions = {
def suitable(cls, url):
    """Receives a URL and returns True if suitable for this IE."""
    # Playlist links must be handled by YoutubePlaylistIE, not here.
    if YoutubePlaylistIE.suitable(url):
        return False
    match = re.match(cls._VALID_URL, url, re.VERBOSE)
    return match is not None
def report_lang(self):
    """Announce the attempt to set the interface language."""
    message = u'Setting language'
    self.to_screen(message)
def report_login(self):
    """Announce the attempt to log in."""
    message = u'Logging in'
    self.to_screen(message)
def report_video_webpage_download(self, video_id):
    """Announce the download of the video webpage."""
    message = u'%s: Downloading video webpage' % video_id
    self.to_screen(message)
def report_video_info_webpage_download(self, video_id):
    """Announce the download of the video info webpage."""
    message = u'%s: Downloading video info webpage' % video_id
    self.to_screen(message)
def report_video_subtitles_download(self, video_id):
    """Report the check for available subtitles."""
    # Docstring fixed: it was a copy-paste of the "video info webpage" one
    # and mis-described this method, which checks subtitle availability.
    self.to_screen(u'%s: Checking available subtitles' % video_id)
def report_video_subtitles_request(self, video_id, sub_lang, format):
    """Report a subtitle download attempt for *sub_lang* in *format*."""
    # Docstring fixed: the original was copy-pasted from the "video info
    # webpage" helper and did not describe this method.
    self.to_screen(u'%s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))
def report_video_subtitles_available(self, video_id, sub_lang_list):
    """Report which subtitle languages are available."""
    langs = ",".join(sub_lang_list.keys())
    self.to_screen(u'%s: Available subtitles for video: %s' % (video_id, langs))
def report_information_extraction(self, video_id):
    """Announce that video information extraction has started."""
    message = u'%s: Extracting video information' % video_id
    self.to_screen(message)
def report_unavailable_format(self, video_id, format):
    """Announce that the requested format is not available."""
    message = u'%s: Format %s not available' % (video_id, format)
    self.to_screen(message)
def report_rtmp_download(self):
    """Announce that the download will use the RTMP protocol."""
    message = u'RTMP download detected'
    self.to_screen(message)
def _get_available_subtitles(self, video_id):
    """Fetch the subtitle track list for *video_id*.

    Returns a {lang_code: track_name} dict on success, or an
    (error_message, None) tuple on failure — callers distinguish the two
    with isinstance(..., tuple).
    """
    self.report_video_subtitles_download(video_id)
    request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
    # Restored the try: header and final return that the elided listing
    # dropped (the orphaned except/early returns made them unambiguous).
    try:
        sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
    except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        return (u'unable to download video subtitles: %s' % compat_str(err), None)
    sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
    sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
    if not sub_lang_list:
        return (u'video doesn\'t have subtitles', None)
    return sub_lang_list
307 def _list_available_subtitles(self, video_id):
308 sub_lang_list = self._get_available_subtitles(video_id)
309 self.report_video_subtitles_available(video_id, sub_lang_list)
def _request_subtitle(self, sub_lang, sub_name, video_id, format):
    """
    Return a tuple:
    (error_message, sub_lang, sub)
    """
    self.report_video_subtitles_request(video_id, sub_lang, format)
    # Restored the elided query-parameter dict and try: header; parameter
    # names follow the timedtext API (lang/name/v/fmt) — TODO confirm.
    params = compat_urllib_parse.urlencode({
        'lang': sub_lang,
        'name': sub_name,
        'v': video_id,
        'fmt': format,
    })
    url = 'http://www.youtube.com/api/timedtext?' + params
    try:
        sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
    except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
    if not sub:
        return (u'Did not fetch video subtitles', None, None)
    return (None, sub_lang, sub)
332 def _extract_subtitle(self, video_id):
334 Return a list with a tuple:
335 [(error_message, sub_lang, sub)]
337 sub_lang_list = self._get_available_subtitles(video_id)
338 sub_format = self._downloader.params.get('subtitlesformat')
339 if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
340 return [(sub_lang_list[0], None, None)]
341 if self._downloader.params.get('subtitleslang', False):
342 sub_lang = self._downloader.params.get('subtitleslang')
343 elif 'en' in sub_lang_list:
346 sub_lang = list(sub_lang_list.keys())[0]
347 if not sub_lang in sub_lang_list:
348 return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]
350 subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
353 def _extract_all_subtitles(self, video_id):
354 sub_lang_list = self._get_available_subtitles(video_id)
355 sub_format = self._downloader.params.get('subtitlesformat')
356 if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
357 return [(sub_lang_list[0], None, None)]
359 for sub_lang in sub_lang_list:
360 subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
361 subtitles.append(subtitle)
364 def _print_formats(self, formats):
365 print('Available formats:')
367 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
def _real_initialize(self):
    """Set the UI language, log in (explicit credentials or .netrc) and
    confirm the age gate.

    NOTE(review): this listing is elided — try:/return statements, blank
    lines and parts of the form dicts are missing between the lines below,
    so the control flow shown here is incomplete.
    """
    if self._downloader is None:
        # [elided lines follow in the original]
    downloader_params = self._downloader.params

    # Attempt to use provided username and password or .netrc data
    if downloader_params.get('username', None) is not None:
        username = downloader_params['username']
        password = downloader_params['password']
    elif downloader_params.get('usenetrc', False):
        # [elided: try: around the .netrc lookup]
        info = netrc.netrc().authenticators(self._NETRC_MACHINE)
        # [elided: use info when found, otherwise:]
        raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
    except (IOError, netrc.NetrcParseError) as err:
        self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))

    # Set language: best effort, only warns on failure.
    request = compat_urllib_request.Request(self._LANG_URL)
    # [elided: try:]
    compat_urllib_request.urlopen(request).read()
    except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))

    # No authentication to be performed
    request = compat_urllib_request.Request(self._LOGIN_URL)
    # [elided: try:]
    login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
    except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))

    # Hidden anti-forgery tokens scraped from the Google login form.
    match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
    galx = match.group(1)
    match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)

    # [elided: login_form_strs dict literal opens here]
        u'continue': u'http://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
        u'PersistentCookie': u'yes',
        u'bgresponse': u'js_disabled',
        u'checkConnection': u'',
        u'checkedDomains': u'youtube',
        u'signIn': u'Sign in',
        u'service': u'youtube',

    # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
    login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
    login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
    request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
    # [elided: try:]
    login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
    if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
        # The login form being present again means the credentials failed.
        self._downloader.report_warning(u'unable to log in: bad username or password')
    except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))

    # Confirm age gate. [elided: age_form dict literal opens here]
        'action_confirm': 'Confirm',
    request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
    # [elided: try:]
    self.report_age_confirmation()
    age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
    except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
473 def _extract_id(self, url):
474 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
476 self._downloader.report_error(u'invalid URL: %s' % url)
478 video_id = mobj.group(2)
def _real_extract(self, url):
    """Extract metadata and download URL(s) for a YouTube video.

    NOTE(review): this listing is elided — try:/except headers, `if mobj
    is not None:` guards, else branches, returns and dict-literal
    delimiters are missing between many of the lines below; the control
    flow shown here is incomplete.
    """
    # Extract original video URL from URL with redirection, like age verification, using next_url parameter
    mobj = re.search(self._NEXT_URL_RE, url)
    url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
    video_id = self._extract_id(url)

    # Get video webpage (has_verified=1 skips the age interstitial).
    self.report_video_webpage_download(video_id)
    url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
    request = compat_urllib_request.Request(url)
    # [elided: try:]
    video_webpage_bytes = compat_urllib_request.urlopen(request).read()
    except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
    video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

    # Attempt to extract SWF player URL
    mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
    player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))

    # Query get_video_info with several el= values until one yields a token.
    self.report_video_info_webpage_download(video_id)
    for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
        video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                % (video_id, el_type))
        video_info_webpage = self._download_webpage(video_info_url, video_id,
                errnote='unable to download video info webpage')
        video_info = compat_parse_qs(video_info_webpage)
        if 'token' in video_info:
    if 'token' not in video_info:
        if 'reason' in video_info:
            self._downloader.report_error(u'YouTube said: %s' % video_info['reason'][0])
        self._downloader.report_error(u'"token" parameter not in video info for unknown reason')

    # Check for "rental" videos
    if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
        self._downloader.report_error(u'"rental" videos not supported')

    # Start extracting information
    self.report_information_extraction(video_id)

    # Uploader name.
    if 'author' not in video_info:
        self._downloader.report_error(u'unable to extract uploader name')
    video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

    # Uploader id / nickname (best effort).
    video_uploader_id = None
    mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
    video_uploader_id = mobj.group(1)
    self._downloader.report_warning(u'unable to extract uploader nickname')

    # Title.
    if 'title' not in video_info:
        self._downloader.report_error(u'unable to extract video title')
    video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

    # Thumbnail.
    if 'thumbnail_url' not in video_info:
        self._downloader.report_warning(u'unable to extract video thumbnail')
    else: # don't panic if we can't find it
        video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

    # Upload date: normalize separators, then try several date formats.
    mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
    upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
    format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
    for expression in format_expressions:
        upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')

    # Description: page element first, <meta> tag as fallback.
    video_description = get_element_by_id("eow-description", video_webpage)
    if video_description:
        video_description = clean_html(video_description)
    fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
    video_description = unescapeHTML(fd_mobj.group(1))
    video_description = u''

    # Subtitles, driven by the writesubtitles/allsubtitles/listsubtitles options.
    video_subtitles = None

    if self._downloader.params.get('writesubtitles', False):
        video_subtitles = self._extract_subtitle(video_id)
        (sub_error, sub_lang, sub) = video_subtitles[0]
        self._downloader.report_error(sub_error)

    if self._downloader.params.get('allsubtitles', False):
        video_subtitles = self._extract_all_subtitles(video_id)
        for video_subtitle in video_subtitles:
            (sub_error, sub_lang, sub) = video_subtitle
            self._downloader.report_error(sub_error)

    if self._downloader.params.get('listsubtitles', False):
        sub_lang_list = self._list_available_subtitles(video_id)

    # Duration.
    if 'length_seconds' not in video_info:
        self._downloader.report_warning(u'unable to extract video duration')
    video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

    # Token.
    video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

    # Decide which formats to download
    req_format = self._downloader.params.get('format', None)

    if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
        self.report_rtmp_download()
        video_url_list = [(None, video_info['conn'][0])]
    elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
        url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
        url_data = [compat_parse_qs(uds) for uds in url_data_strs]
        url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
        url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

        format_limit = self._downloader.params.get('format_limit', None)
        available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
        if format_limit is not None and format_limit in available_formats:
            format_list = available_formats[available_formats.index(format_limit):]
            # [elided: else:]
            format_list = available_formats
        existing_formats = [x for x in format_list if x in url_map]
        if len(existing_formats) == 0:
            raise ExtractorError(u'no known formats available for video')
        if self._downloader.params.get('listformats', None):
            self._print_formats(existing_formats)
        if req_format is None or req_format == 'best':
            video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
        elif req_format == 'worst':
            video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
        elif req_format in ('-1', 'all'):
            video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            # Specific formats. We pick the first in a slash-delimeted sequence.
            # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
            req_formats = req_format.split('/')
            video_url_list = None
            for rf in req_formats:
                video_url_list = [(rf, url_map[rf])]
            if video_url_list is None:
                raise ExtractorError(u'requested format not available')
        # [elided: else:]
        raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')

    # Build one result dict per selected format.
    for format_param, video_real_url in video_url_list:
        # Extension
        video_extension = self._video_extensions.get(format_param, 'flv')

        video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                          self._video_dimensions.get(format_param, '???'))

        # [elided: results dict literal opens here]
            'url': video_real_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date': upload_date,
            'title': video_title,
            'ext': video_extension,
            'format': video_format,
            'thumbnail': video_thumbnail,
            'description': video_description,
            'player_url': player_url,
            'subtitles': video_subtitles,
            'duration': video_duration
683 class MetacafeIE(InfoExtractor):
684 """Information Extractor for metacafe.com."""
# Watch-page URLs; group(1) is the video id, group(2) the title slug.
_VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
# Family-filter disclaimer page fetched during initialization.
_DISCLAIMER = 'http://www.metacafe.com/family_filter/'
# Endpoint receiving the family-filter form POST (age confirmation).
_FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
IE_NAME = u'metacafe'
def report_disclaimer(self):
    """Announce retrieval of the family-filter disclaimer page."""
    message = u'Retrieving disclaimer'
    self.to_screen(message)
def _real_initialize(self):
    """Fetch the disclaimer page, then POST the family-filter form to
    confirm age.

    NOTE(review): elided listing — try: headers, returns and part of the
    disclaimer form dict are missing between the lines below.
    """
    # Retrieve disclaimer
    request = compat_urllib_request.Request(self._DISCLAIMER)
    # [elided: try:]
    self.report_disclaimer()
    disclaimer = compat_urllib_request.urlopen(request).read()
    except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        self._downloader.report_error(u'unable to retrieve disclaimer: %s' % compat_str(err))

    # Confirm age. [elided: disclaimer_form dict literal opens here]
        'submit': "Continue - I'm over 18",
    request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
    # [elided: try:]
    self.report_age_confirmation()
    disclaimer = compat_urllib_request.urlopen(request).read()
    except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
def _real_extract(self, url):
    """Extract the media URL, uploader and title for a Metacafe video.

    NOTE(review): elided listing — `if mobj is None:` guards, returns and
    the result-dict delimiters are missing between the lines below.
    """
    # Extract id and simplified title from URL
    mobj = re.match(self._VALID_URL, url)
    self._downloader.report_error(u'invalid URL: %s' % url)
    video_id = mobj.group(1)

    # Check if video comes from YouTube
    mobj2 = re.match(r'^yt-(.*)$', video_id)
    if mobj2 is not None:
        # Delegate yt-prefixed ids to the YouTube extractor.
        return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1), 'Youtube')]

    # Retrieve video webpage to extract further information
    webpage = self._download_webpage('http://www.metacafe.com/watch/%s/' % video_id, video_id)

    # Extract URL, uploader and title from webpage
    self.report_extraction(video_id)
    mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
    mediaURL = compat_urllib_parse.unquote(mobj.group(1))
    video_extension = mediaURL[-3:]

    # Extract gdaKey if available
    mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
    gdaKey = mobj.group(1)
    video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)

    # Fallback: parse the flashvars blob when &mediaURL= is absent.
    mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
    self._downloader.report_error(u'unable to extract media URL')
    vardict = compat_parse_qs(mobj.group(1))
    if 'mediaData' not in vardict:
        self._downloader.report_error(u'unable to extract media URL')
    mobj = re.search(r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
    self._downloader.report_error(u'unable to extract media URL')
    mediaURL = mobj.group('mediaURL').replace('\\/', '/')
    video_extension = mediaURL[-3:]
    video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key'))

    # Title.
    mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
    self._downloader.report_error(u'unable to extract title')
    video_title = mobj.group(1).decode('utf-8')

    # Uploader nickname.
    mobj = re.search(r'submitter=(.*?);', webpage)
    self._downloader.report_error(u'unable to extract uploader nickname')
    video_uploader = mobj.group(1)

    # [elided: return list-of-dict literal opens here]
        'id': video_id.decode('utf-8'),
        'url': video_url.decode('utf-8'),
        'uploader': video_uploader.decode('utf-8'),
        'title': video_title,
        'ext': video_extension.decode('utf-8'),
788 class DailymotionIE(InfoExtractor):
789 """Information Extractor for Dailymotion"""
# Dailymotion video page URLs; group(1) carries the id plus optional slug/query.
_VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
IE_NAME = u'dailymotion'
def _real_extract(self, url):
    """Extract the best-quality media URL, uploader, title and date for a
    Dailymotion video.

    NOTE(review): elided listing — `if mobj is None:` guards, try/break
    lines inside the quality loop, else branches, returns and the
    result-dict delimiters are missing between the lines below.
    """
    # Extract id and simplified title from URL
    mobj = re.match(self._VALID_URL, url)
    self._downloader.report_error(u'invalid URL: %s' % url)

    video_id = mobj.group(1).split('_')[0].split('?')[0]

    video_extension = 'mp4'

    # Retrieve video webpage to extract further information
    request = compat_urllib_request.Request(url)
    # Disable the family filter so age-gated pages still render.
    request.add_header('Cookie', 'family_filter=off')
    webpage = self._download_webpage(request, video_id)

    # Extract URL, uploader and title from webpage
    self.report_extraction(video_id)
    mobj = re.search(r'\s*var flashvars = (.*)', webpage)
    self._downloader.report_error(u'unable to extract media URL')
    flashvars = compat_urllib_parse.unquote(mobj.group(1))

    # Probe qualities best-first; [elided: break when a key is found].
    for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
        self.to_screen(u'Using %s' % key)
    self._downloader.report_error(u'unable to extract video URL')

    mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
    self._downloader.report_error(u'unable to extract video URL')

    video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')

    # TODO: support choosing qualities

    mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
    self._downloader.report_error(u'unable to extract title')
    video_title = unescapeHTML(mobj.group('title'))

    video_uploader = None
    mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
    # Looking for an official user account as fallback.
    mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
    if mobj_official is None:
        self._downloader.report_warning(u'unable to extract uploader nickname')
        video_uploader = mobj_official.group(1)
        video_uploader = mobj.group(1)

    video_upload_date = None
    mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
    # Reassemble DD-MM-YYYY into YYYYMMDD.
    video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)

    # [elided: return list-of-dict literal opens here]
        'uploader': video_uploader,
        'upload_date': video_upload_date,
        'title': video_title,
        'ext': video_extension,
870 class PhotobucketIE(InfoExtractor):
871 """Information extractor for photobucket.com."""
# Only direct .flv "current=" links are supported; group(1) is the file name.
_VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
IE_NAME = u'photobucket'
def _real_extract(self, url):
    """Extract the media URL, uploader and title for a Photobucket clip.

    NOTE(review): elided listing — `if mobj is None:` guards, try:
    headers, returns and the result-dict delimiters are missing between
    the lines below.
    """
    # Extract id from URL
    mobj = re.match(self._VALID_URL, url)
    self._downloader.report_error(u'Invalid URL: %s' % url)

    video_id = mobj.group(1)

    video_extension = 'flv'

    # Retrieve video webpage to extract further information
    request = compat_urllib_request.Request(url)
    # [elided: try:]
    self.report_download_webpage(video_id)
    webpage = compat_urllib_request.urlopen(request).read()
    except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))

    # Extract URL, uploader, and title from webpage
    self.report_extraction(video_id)
    mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
    self._downloader.report_error(u'unable to extract media URL')
    mediaURL = compat_urllib_parse.unquote(mobj.group(1))

    # Title and uploader come from the page <title>.
    mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
    self._downloader.report_error(u'unable to extract title')
    video_title = mobj.group(1).decode('utf-8')

    video_uploader = mobj.group(2).decode('utf-8')

    # [elided: return list-of-dict literal opens here]
        'id': video_id.decode('utf-8'),
        'url': video_url.decode('utf-8'),
        'uploader': video_uploader,
        'title': video_title,
        'ext': video_extension.decode('utf-8'),
924 class YahooIE(InfoExtractor):
925 """Information extractor for video.yahoo.com."""
# _VALID_URL matches all Yahoo! Video URLs
# _VPAGE_URL matches only the extractable '/watch/' URLs
# In both patterns group(1) is a numeric page/owner id and group(2) the
# numeric video id — TODO confirm the exact group semantics with callers.
_VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
_VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
IE_NAME = u'video.yahoo'
934 def _real_extract(self, url, new_video=True):
935 # Extract ID from URL
936 mobj = re.match(self._VALID_URL, url)
938 self._downloader.report_error(u'Invalid URL: %s' % url)
941 video_id = mobj.group(2)
942 video_extension = 'flv'
944 # Rewrite valid but non-extractable URLs as
945 # extractable English language /watch/ URLs
946 if re.match(self._VPAGE_URL, url) is None:
947 request = compat_urllib_request.Request(url)
949 webpage = compat_urllib_request.urlopen(request).read()
950 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
951 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
954 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
956 self._downloader.report_error(u'Unable to extract id field')
958 yahoo_id = mobj.group(1)
960 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
962 self._downloader.report_error(u'Unable to extract vid field')
964 yahoo_vid = mobj.group(1)
966 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
967 return self._real_extract(url, new_video=False)
969 # Retrieve video webpage to extract further information
970 request = compat_urllib_request.Request(url)
972 self.report_download_webpage(video_id)
973 webpage = compat_urllib_request.urlopen(request).read()
974 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
975 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
978 # Extract uploader and title from webpage
979 self.report_extraction(video_id)
980 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
982 self._downloader.report_error(u'unable to extract video title')
984 video_title = mobj.group(1).decode('utf-8')
986 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
988 self._downloader.report_error(u'unable to extract video uploader')
990 video_uploader = mobj.group(1).decode('utf-8')
992 # Extract video thumbnail
993 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
995 self._downloader.report_error(u'unable to extract video thumbnail')
997 video_thumbnail = mobj.group(1).decode('utf-8')
999 # Extract video description
1000 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1002 self._downloader.report_error(u'unable to extract video description')
1004 video_description = mobj.group(1).decode('utf-8')
1005 if not video_description:
1006 video_description = 'No description available.'
1008 # Extract video height and width
1009 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1011 self._downloader.report_error(u'unable to extract video height')
1013 yv_video_height = mobj.group(1)
1015 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1017 self._downloader.report_error(u'unable to extract video width')
1019 yv_video_width = mobj.group(1)
1021 # Retrieve video playlist to extract media URL
1022 # I'm not completely sure what all these options are, but we
1023 # seem to need most of them, otherwise the server sends a 401.
1024 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
1025 yv_bitrate = '700' # according to Wikipedia this is hard-coded
1026 request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1027 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1028 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1030 self.report_download_webpage(video_id)
1031 webpage = compat_urllib_request.urlopen(request).read()
1032 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1033 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
1036 # Extract media URL from playlist XML
1037 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1039 self._downloader.report_error(u'Unable to extract media URL')
1041 video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1042 video_url = unescapeHTML(video_url)
1045 'id': video_id.decode('utf-8'),
1047 'uploader': video_uploader,
1048 'upload_date': None,
1049 'title': video_title,
1050 'ext': video_extension.decode('utf-8'),
1051 'thumbnail': video_thumbnail.decode('utf-8'),
1052 'description': video_description,
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com.

    Parses the player config JSON embedded in the watch page and picks the
    first file from the best available quality bucket (hd, then sd, then
    whatever else is listed).
    NOTE(review): this chunk was extracted with some control-flow lines
    (try:/if/else/return and part of the result dict) dropped; indentation
    below reflects the inferred original nesting -- restore the missing
    lines before running.
    """

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'

    def _real_extract(self, url, new_video=True):
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
            # reached only when the URL did not match (guard line missing here)
            self._downloader.report_error(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        if not mobj.group('proto'):
            # no scheme given: default to https
            url = 'https://' + url
        if mobj.group('direct_link'):
            # rewrite play_redirect_hls links to the canonical watch page
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
            self.report_download_webpage(video_id)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON (embedded between " = {config:" and ",assets:")
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
            if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage):
                self._downloader.report_error(u'The author has restricted the access to this video, try with the "--referer" option')
            self._downloader.report_error(u'unable to extract info section')

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description: video_description = clean_html(video_description)
        else: video_description = u''

        # Extract upload date
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            # collapse YYYY-MM-DD into YYYYMMDD
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                    # NOTE(review): this append presumably sat under an "else:"
                    # branch that was lost in extraction
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        # pick the first entry of the best non-empty quality bucket
        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self.to_screen(u'%s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
            # reached only when no codec matched (for/else scaffolding missing)
            self._downloader.report_error(u'no known codec found')

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    %(video_id, sig, timestamp, video_quality, video_codec.upper())

        # NOTE(review): the surrounding "return [{ ... }]" lines (including the
        # 'id' and 'url' entries) were lost in extraction
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date': video_upload_date,
            'title': video_title,
            'ext': video_extension,
            'thumbnail': video_thumbnail,
            'description': video_description,
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor.

    Handles live streams (extractLiveStream) and "+7" catch-up videos
    (extractPlus7Stream) by scraping intermediate JS/XML resources with
    grep_webpage.
    NOTE(review): this chunk was extracted with some control-flow lines
    (try:/if/else/return and call-argument lines) dropped; indentation
    below reflects the inferred original nesting.
    """

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    # live pages end in index-<n>.html
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def fetch_webpage(self, url):
        """Download url and return the raw page body."""
        request = compat_urllib_request.Request(url)
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
        except ValueError as err:
            self._downloader.report_error(u'Invalid URL: %s' % url)

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch url, apply regex, and collect the listed groups into a dict.

        matchTuples is a list of (group_index, key, error_message) tuples;
        each matched group is stored under its key, err is reported when the
        group is missing.
        """
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
            # reached only when the regex did not match (guard line missing)
            self._downloader.report_error(u'Invalid URL: %s' % url)

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                self._downloader.report_error(err)
                # NOTE(review): this assignment presumably sat under "else:"
                info[key] = mobj.group(i)

    def extractLiveStream(self, url):
        # language code is the 4th-from-last path component on live URLs
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            r'src="(.*?/videothek_js.*?\.js)',
                (1, 'url', u'Invalid URL: %s' % url)
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
                (1, 'path', u'could not extract video path: %s' % url),
                (2, 'player', u'could not extract video player: %s' % url),
                (3, 'url', u'could not extract video url: %s' % url)
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        # language code is the 3rd-from-last path component on +7 URLs
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
                (1, 'url', u'Invalid URL: %s' % url)
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
                (1, 'url', u'Could not find <video> tag: %s' % url)
        next_url = compat_urllib_parse.unquote(info.get('url'))

        info = self.grep_webpage(
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
                (1, 'id', u'could not extract video id: %s' % url),
                (2, 'title', u'could not extract video title: %s' % url),
                (3, 'date', u'could not extract video date: %s' % url),
                (4, 'url', u'could not extract video url: %s' % url)

        # NOTE(review): the surrounding "return { ... }" lines were lost
            'id': info.get('id'),
            'url': compat_urllib_parse.unquote(info.get('url')),
            'uploader': u'arte.tv',
            'upload_date': info.get('date'),
            'title': info.get('title').decode('utf-8'),

    def _real_extract(self, url):
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
            # NOTE(review): the "else:" (and return) around this line was lost
            info = self.extractPlus7Stream(url)
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor.

    Follows URL-shortener-style redirects (via HEAD requests), then scrapes
    the page for a JW Player / SWFObject style file= media URL.
    NOTE(review): this chunk was extracted with some control-flow lines
    (try:/if/else/return and some call-argument lines) dropped; indentation
    below reflects the inferred original nesting.
    """

    IE_NAME = u'generic'

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        # warn only outside the test suite that we fell through to the generic IE
        if not self._downloader.params.get('test', False):
            self._downloader.report_warning(u'Falling back on generic information extractor.')
        super(GenericIE, self).report_download_webpage(video_id)

    def report_following_redirect(self, new_url):
        """Report information extraction."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case return the new url."""
        class HeadRequest(compat_urllib_request.Request):
            # issue HEAD instead of GET so only headers are fetched
            def get_method(self):

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    # drop body-describing headers: the redirected HEAD has no body
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       origin_req_host=req.get_origin_req_host(),
                    # non-redirect codes are surfaced as HTTP errors
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                        origin_req_host=req.get_origin_req_host(),

        # Build a bare opener so the custom HEAD-preserving handlers are used
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        self.report_following_redirect(new_url)

    def _real_extract(self, url):
        new_url = self._test_redirect(url)
        if new_url: return [self.url_result(new_url)]

        video_id = url.split('/')[-1]
            webpage = self._download_webpage(url, video_id)
        except ValueError as err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            self._downloader.report_error(u'Invalid URL: %s' % url)

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
            # Broaden the search a little bit
            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
            # Broaden the search a little bit: JWPlayer JS loader
            mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
            # reached only when nothing matched (guard lines missing)
            self._downloader.report_error(u'Invalid URL: %s' % url)

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
            self._downloader.report_error(u'unable to extract title')

        video_title = mobj.group(1)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
            self._downloader.report_error(u'unable to extract title')

        video_uploader = mobj.group(1)

        # NOTE(review): the surrounding "return [{ ... }]" lines (including
        # 'id' and 'url' entries) were lost in extraction
            'uploader': video_uploader,
            'upload_date': None,
            'title': video_title,
            'ext': video_extension,
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries.

    Accepts "ytsearch<N>:<query>" / "ytsearchall:<query>" pseudo-URLs and
    resolves them through the GData JSON API.
    NOTE(review): this chunk was extracted with some control-flow lines
    (try:/if/return) dropped; indentation below reflects inferred nesting.
    """
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        mobj = re.match(self._VALID_URL, query)
            # reached only when the query did not match (guard line missing)
            self._downloader.report_error(u'invalid search query "%s"' % query)

        prefix, query = query.split(':')
        query = query.encode('utf-8')
            # empty prefix -> single result (enclosing "if prefix == '':" missing)
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            self._get_n_results(query, self._max_youtube_results)
                # invalid (non-positive) requested count
                self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
            elif n > self._max_youtube_results:
                self._downloader.report_warning(u'ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                n = self._max_youtube_results
                return self._get_n_results(query, n)
            except ValueError: # parsing prefix as integer fails
                return self._get_n_results(query, 1)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        # page through the API 50 results at a time
        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download API page: %s' % compat_str(err))

            api_response = json.loads(data)['data']

            if not 'items' in api_response:
                self._downloader.report_error(u'[youtube] No video results')

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            # never ask for more than the API reports available
            limit = min(n, api_response['totalItems'])

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids]
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries.

    Accepts "gvsearch<N>:<query>" / "gvsearchall:<query>" pseudo-URLs and
    hands each result URL directly to the downloader.
    NOTE(review): this chunk was extracted with some control-flow lines
    (try:/if/return/loop headers) dropped; indentation reflects inferred
    nesting.
    """
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    # presence of this marker means more result pages exist
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self.to_screen(u'query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        mobj = re.match(self._VALID_URL, query)
            # reached only when the query did not match (guard line missing)
            self._downloader.report_error(u'invalid search query "%s"' % query)

        prefix, query = query.split(':')
        query = query.encode('utf-8')
            # empty prefix -> single result (enclosing "if prefix == '':" missing)
            self._download_n_results(query, 1)

        elif prefix == 'all':
            self._download_n_results(query, self._max_google_results)
                # invalid (non-positive) requested count
                self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
            elif n > self._max_google_results:
                self._downloader.report_warning(u'gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
                n = self._max_google_results
                self._download_n_results(query, n)

            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

            # loop header missing; body pages through results 10 at a time
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
            request = compat_urllib_request.Request(result_url)
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in video_ids:
                    video_ids.append(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])

            # no "next" link: flush everything collected so far
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for id in video_ids:
                    self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])

            pagenum = pagenum + 1
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries.

    Accepts "yvsearch<N>:<query>" / "yvsearchall:<query>" pseudo-URLs and
    hands each result URL directly to the downloader.
    NOTE(review): this chunk was extracted with some control-flow lines
    (try:/if/return/loop headers) dropped; indentation reflects inferred
    nesting.
    """

    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    # presence of this marker means more result pages exist
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self.to_screen(u'query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        mobj = re.match(self._VALID_URL, query)
            # reached only when the query did not match (guard line missing)
            self._downloader.report_error(u'invalid search query "%s"' % query)

        prefix, query = query.split(':')
        query = query.encode('utf-8')
            # empty prefix -> single result (enclosing "if prefix == '':" missing)
            self._download_n_results(query, 1)

        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
                # invalid (non-positive) requested count
                self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))

            elif n > self._max_yahoo_results:
                self._downloader.report_warning(u'yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
                n = self._max_yahoo_results
                self._download_n_results(query, n)

            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        already_seen = set()

            # loop header missing; body pages through results one page at a time
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
            request = compat_urllib_request.Request(result_url)
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in already_seen:
                    video_ids.append(video_id)
                    already_seen.add(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._downloader.download(['http://video.yahoo.com/watch/%s' % id])

            # no "Next" link: flush everything collected so far
            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for id in video_ids:
                    self._downloader.download(['http://video.yahoo.com/watch/%s' % id])

            pagenum = pagenum + 1
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists.

    NOTE(review): this chunk was extracted with some lines dropped,
    including several lines (and the closing quotes) of the verbose
    _VALID_URL pattern and try:/if/loop scaffolding in _real_extract;
    indentation reflects inferred nesting.
    """

    # verbose (re.VERBOSE) pattern matching playlist/course/artist URLs
    # and bare PL/EC/UU ids
    _VALID_URL = r"""(?:
                     (?:course|view_play_list|my_playlists|artist|playlist|watch)
                     \? (?:.*?&)*? (?:p|a|list)=
                     ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
                     ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'

    IE_NAME = u'youtube:playlist'

    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is a verbose pattern, so match with re.VERBOSE
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            # reached only when the URL did not match (guard line missing)
            self._downloader.report_error(u'invalid url: %s' % url)

        # Download playlist videos from API
        playlist_id = mobj.group(1) or mobj.group(2)

            # loop header missing; body fetches one API page per iteration
            self.report_download_page(playlist_id, page_num)

            url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
                page = compat_urllib_request.urlopen(url).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))

                response = json.loads(page)
            except ValueError as err:
                self._downloader.report_error(u'Invalid JSON in API response: ' + compat_str(err))

            if 'feed' not in response:
                self._downloader.report_error(u'Got a malformed response from YouTube API')
            playlist_title = response['feed']['title']['$t']
            if 'entry' not in response['feed']:
                # Number of videos is a multiple of self._MAX_RESULTS

            # keep (position, watch-URL) pairs so ordering survives paging
            videos += [ (entry['yt$position']['$t'], entry['content']['src'])
                        for entry in response['feed']['entry']
                        if 'content' in entry ]

            # short page -> this was the last one
            if len(response['feed']['entry']) < self._MAX_RESULTS:

        videos = [v[1] for v in sorted(videos)]

        url_results = [self.url_result(url, 'Youtube') for url in videos]
        return [self.playlist_result(url_results, playlist_id, playlist_title)]
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels.

    Scrapes the first channel page, then pages through the JSON-based
    channel_ajax endpoint until the "load more" marker disappears.
    NOTE(review): this chunk was extracted with some control-flow lines
    (try:/while/return) dropped; indentation reflects inferred nesting.
    """

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    # marker present while further pages exist
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def report_download_page(self, channel_id, pagenum):
        """Report attempt to download channel page with given number."""
        self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))

    def extract_videos_from_page(self, page):
        """Collect watch?v= ids in order of first appearance, de-duplicated."""
        for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))

    def _real_extract(self, url):
        # Extract channel id
        mobj = re.match(self._VALID_URL, url)
            # reached only when the URL did not match (guard line missing)
            self._downloader.report_error(u'invalid url: %s' % url)

        # Download channel page
        channel_id = mobj.group(1)

        self.report_download_page(channel_id, pagenum)
        url = self._TEMPLATE_URL % (channel_id, pagenum)
        request = compat_urllib_request.Request(url)
            page = compat_urllib_request.urlopen(request).read().decode('utf8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))

        # Extract video identifiers
        ids_in_page = self.extract_videos_from_page(page)
        video_ids.extend(ids_in_page)

        # Download any subsequent channel pages using the json-based channel_ajax query
        if self._MORE_PAGES_INDICATOR in page:
                # loop header missing; body fetches one AJAX page per iteration
                pagenum = pagenum + 1

                self.report_download_page(channel_id, pagenum)
                url = self._MORE_PAGES_URL % (pagenum, channel_id)
                request = compat_urllib_request.Request(url)
                    page = compat_urllib_request.urlopen(request).read().decode('utf8')
                except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                    self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))

                page = json.loads(page)

                ids_in_page = self.extract_videos_from_page(page['content_html'])
                video_ids.extend(ids_in_page)

                # the AJAX response carries the indicator in its widget HTML
                if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids]
        url_entries = [self.url_result(url, 'Youtube') for url in urls]
        return [self.playlist_result(url_entries, channel_id)]
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users.

    Pages through the GData uploads feed in _GDATA_PAGE_SIZE chunks and
    returns the user's uploads as a playlist.
    NOTE(review): this chunk was extracted with some control-flow lines
    (try:/while/return) dropped; indentation reflects inferred nesting.
    """

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
            # reached only when the URL did not match (guard line missing)
            self._downloader.report_error(u'invalid url: %s' % url)

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

            # loop header missing; body fetches one GData page per iteration
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            self.report_download_page(username, start_index)

            request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))

            # Extract video identifiers

            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._GDATA_PAGE_SIZE:

        urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
        url_results = [self.url_result(url, 'Youtube') for url in urls]
        return [self.playlist_result(url_results, playlist_title = username)]
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users.

    Resolves the numeric users_id from the profile page, then pages through
    the mobile episode-list AJAX endpoint.
    NOTE(review): this chunk was extracted with some control-flow lines
    (try:/while/return) dropped; indentation reflects inferred nesting.
    """

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'

    IE_NAME = u'blip.tv:user'

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self.to_screen(u'user %s: Downloading video ids from page %d' %
                (username, pagenum))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
            # reached only when the URL did not match (guard line missing)
            self._downloader.report_error(u'invalid url: %s' % url)

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        request = compat_urllib_request.Request(url)

            page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # the numeric users_id is embedded in the profile page markup
            mobj = re.search(r'data-users-id="([^"]+)"', page)
            page_base = page_base % mobj.group(1)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

            # loop header missing; body fetches one AJAX page per iteration
            self.report_download_page(username, pagenum)
            url = page_base + "&page=" + str(pagenum)
            request = compat_urllib_request.Request( url )
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % str(err))

            # Extract video identifiers

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(unescapeHTML(mobj.group(1)))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._PAGE_SIZE:

        urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
        url_entries = [self.url_result(url, 'BlipTV') for url in urls]
        return [self.playlist_result(url_entries, playlist_title = username)]
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com.

    Simulates pressing the 'Free download' button and scrapes the resulting
    page for the real file URL and title.
    NOTE(review): this chunk was extracted with some control-flow lines
    (try:/if/else/return and part of the result dict) dropped; indentation
    reflects inferred nesting.
    """

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def _real_extract(self, url):
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
            self.report_download_webpage(file_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve file webpage: %s' % compat_str(err))

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                # surface the site's own restriction message verbatim
                restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
                self._downloader.report_error(u'%s' % restriction_message)
                # NOTE(review): this fallback presumably sat under "else:"
                self._downloader.report_error(u'unable to extract download URL from: %s' % url)

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
            # reached only when the title was not found (guard line missing)
            self._downloader.report_error(u'unable to extract title')

        file_title = mobj.group(1).decode('utf-8')

        # NOTE(review): the surrounding "return [{ ... }]" lines were lost
            'id': file_id.decode('utf-8'),
            'url': file_url.decode('utf-8'),
            'upload_date': None,
            'title': file_title,
            'ext': file_extension.decode('utf-8'),
2017 class FacebookIE(InfoExtractor):
2018 """Information Extractor for Facebook"""
# NOTE(review): this excerpt is a numbered listing with gaps -- 'try:' headers,
# 'if ... is None:' guards, 'else:' branches and 'return' statements paired
# with the lines below are elided.
2020 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2021 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2022 _NETRC_MACHINE = 'facebook'
2023 IE_NAME = u'facebook'
2025 def report_login(self):
2026 """Report attempt to log in."""
2027 self.to_screen(u'Logging in')
# Optional login: credentials come from --username/--password or ~/.netrc.
2029 def _real_initialize(self):
2030 if self._downloader is None:
2035 downloader_params = self._downloader.params
2037 # Attempt to use provided username and password or .netrc data
2038 if downloader_params.get('username', None) is not None:
2039 useremail = downloader_params['username']
2040 password = downloader_params['password']
2041 elif downloader_params.get('usenetrc', False):
2043 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2044 if info is not None:
2048 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2049 except (IOError, netrc.NetrcParseError) as err:
# A bad .netrc only produces a warning; extraction proceeds unauthenticated.
2050 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
2053 if useremail is None:
2062 request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
2065 login_results = compat_urllib_request.urlopen(request).read()
# If the response still contains the login form, authentication failed.
# NOTE(review): 'exceded' in the message below is a typo for 'exceeded'.
2066 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2067 self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2069 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2070 self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
2073 def _real_extract(self, url):
2074 mobj = re.match(self._VALID_URL, url)
2076 self._downloader.report_error(u'invalid URL: %s' % url)
2078 video_id = mobj.group('ID')
# Canonicalize to the desktop video URL before downloading the page.
2080 url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
2081 webpage = self._download_webpage(url, video_id)
# The player parameters are embedded in inline JavaScript between these
# two literal markers; re.escape() lets them be used inside the pattern.
2083 BEFORE = '{swf.addParam(param[0], param[1]);});\n'
2084 AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
2085 m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
2087 raise ExtractorError(u'Cannot parse data')
2088 data = dict(json.loads(m.group(1)))
2089 params_raw = compat_urllib_parse.unquote(data['params'])
2090 params = json.loads(params_raw)
2091 video_data = params['video_data'][0]
# Prefer the HD source; the elided lines appear to fall back to 'sd_src'.
2092 video_url = video_data.get('hd_src')
2094 video_url = video_data['sd_src']
2096 raise ExtractorError(u'Cannot find video URL')
2097 video_duration = int(video_data['video_duration'])
2098 thumbnail = video_data['thumbnail_src']
2100 m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
2102 raise ExtractorError(u'Cannot find title in webpage')
2103 video_title = unescapeHTML(m.group(1))
2107 'title': video_title,
2110 'duration': video_duration,
2111 'thumbnail': thumbnail,
2116 class BlipTVIE(InfoExtractor):
2117 """Information extractor for blip.tv"""
# NOTE(review): this excerpt is a numbered listing with gaps -- 'try:' headers,
# guards, 'return' statements and dict openers paired with the lines below
# are elided.
2119 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
# Pattern used to pull the filename extension out of the media URL.
2120 _URL_EXT = r'^.*\.([a-z0-9]+)$'
2121 IE_NAME = u'blip.tv'
2123 def report_direct_download(self, title):
2124 """Report information extraction."""
2125 self.to_screen(u'%s: Direct download detected' % title)
2127 def _real_extract(self, url):
2128 mobj = re.match(self._VALID_URL, url)
2130 self._downloader.report_error(u'invalid URL: %s' % url)
# /play/ URLs redirect to a player page whose fragment carries the real
# file id; resolve it, rebuild a canonical URL, and recurse once.
2133 urlp = compat_urllib_parse_urlparse(url)
2134 if urlp.path.startswith('/play/'):
2135 request = compat_urllib_request.Request(url)
2136 response = compat_urllib_request.urlopen(request)
2137 redirecturl = response.geturl()
2138 rurlp = compat_urllib_parse_urlparse(redirecturl)
2139 file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
2140 url = 'http://blip.tv/a/a-' + file_id
2141 return self._real_extract(url)
# Ask the site for a JSON rendition of the page metadata.
2148 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2149 request = compat_urllib_request.Request(json_url)
# The iTunes user agent is required for the server to serve usable data.
2150 request.add_header('User-Agent', 'iTunes/10.6.1')
2151 self.report_extraction(mobj.group(1))
2154 urlh = compat_urllib_request.urlopen(request)
2155 if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
# The URL pointed straight at the media file: derive id/title from the name.
2156 basename = url.split('/')[-1]
2157 title,ext = os.path.splitext(basename)
2158 title = title.decode('UTF-8')
2159 ext = ext.replace('.', '')
2160 self.report_direct_download(title)
2165 'upload_date': None,
2170 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2171 raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
2172 if info is None: # Regular URL
2174 json_code_bytes = urlh.read()
2175 json_code = json_code_bytes.decode('utf-8')
2176 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2177 self._downloader.report_error(u'unable to read video info webpage: %s' % compat_str(err))
2181 json_data = json.loads(json_code)
# Some responses wrap the payload in a 'Post' key; unwrap if present.
2182 if 'Post' in json_data:
2183 data = json_data['Post']
# Convert the site's datestamp to the YYYYMMDD form youtube-dl expects.
# NOTE(review): '%H:%M%p' mixes 24-hour %H with AM/PM %p -- looks suspicious;
# confirm against actual blip.tv datestamp values.
2187 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2188 video_url = data['media']['url']
2189 umobj = re.match(self._URL_EXT, video_url)
2191 raise ValueError('Can not determine filename extension')
2192 ext = umobj.group(1)
2195 'id': data['item_id'],
2197 'uploader': data['display_name'],
2198 'upload_date': upload_date,
2199 'title': data['title'],
2201 'format': data['media']['mimeType'],
2202 'thumbnail': data['thumbnailUrl'],
2203 'description': data['description'],
2204 'player_url': data['embedUrl'],
2205 'user_agent': 'iTunes/10.6.1',
2207 except (ValueError,KeyError) as err:
2208 self._downloader.report_error(u'unable to parse video information: %s' % repr(err))
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de.

    Builds the FLV media URL from the <link rel='image_src'> thumbnail
    URL found in the watch page, and the title from the page <title>.
    """

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # BUG FIX: was 'self._download.report_error' (missing 'er'),
            # which would raise AttributeError instead of reporting.
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)
        # The movie directory of the thumbnail URL doubles as the base of
        # the media URL; append '<id>.flv' to obtain the video itself.
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\'',
                 webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return

        video_title = mobj.group(1)

        # Single-entry result list, as required by _real_extract().
        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': None,
            'upload_date': None,
            'title':    video_title,
            'ext':      u'flv',
        }]
2256 class ComedyCentralIE(InfoExtractor):
2257 """Information extractor for The Daily Show and Colbert Report """
# NOTE(review): this excerpt is a numbered listing with gaps -- 'try:' headers,
# guards, 'else:' branches, loop bodies and 'return' statements paired with
# the lines below are elided (e.g. the dict literals for _video_extensions /
# _video_dimensions and the population of 'turls').
2259 # urls can be abbreviations like :thedailyshow or :colbert
2260 # urls for episodes like:
2261 # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
2262 # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
2263 # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
2264 _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
2265 |(https?://)?(www\.)?
2266 (?P<showname>thedailyshow|colbertnation)\.com/
2267 (full-episodes/(?P<episode>.*)|
2269 (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
2270 |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
# Known bitrates, lowest to highest; turls[-1] below relies on this order.
2273 _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
2275 _video_extensions = {
2283 _video_dimensions = {
# Overrides the base class because _VALID_URL is a verbose (re.VERBOSE) regex.
2293 def suitable(cls, url):
2294 """Receives a URL and returns True if suitable for this IE."""
2295 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
2297 def report_config_download(self, episode_id, media_id):
2298 self.to_screen(u'%s: Downloading configuration for %s' % (episode_id, media_id))
2300 def report_index_download(self, episode_id):
2301 self.to_screen(u'%s: Downloading show index' % episode_id)
2303 def _print_formats(self, formats):
2304 print('Available formats:')
2306 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))
2309 def _real_extract(self, url):
2310 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2312 self._downloader.report_error(u'invalid URL: %s' % url)
# Expand ':tds' / ':colbert' style abbreviations to the full-episodes URL,
# then re-match so the named groups are populated.
2315 if mobj.group('shortname'):
2316 if mobj.group('shortname') in ('tds', 'thedailyshow'):
2317 url = u'http://www.thedailyshow.com/full-episodes/'
2319 url = u'http://www.colbertnation.com/full-episodes/'
2320 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2321 assert mobj is not None
2323 if mobj.group('clip'):
2324 if mobj.group('showname') == 'thedailyshow':
2325 epTitle = mobj.group('tdstitle')
2327 epTitle = mobj.group('cntitle')
# No explicit episode means "download the newest full episode".
2330 dlNewest = not mobj.group('episode')
2332 epTitle = mobj.group('showname')
2334 epTitle = mobj.group('episode')
2336 req = compat_urllib_request.Request(url)
2337 self.report_extraction(epTitle)
2339 htmlHandle = compat_urllib_request.urlopen(req)
2340 html = htmlHandle.read()
2341 webpage = html.decode('utf-8')
2342 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2343 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
# The newest-episode request redirects; re-match the final URL to learn
# which specific episode we actually landed on.
2346 url = htmlHandle.geturl()
2347 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2349 self._downloader.report_error(u'Invalid redirected URL: ' + url)
2351 if mobj.group('episode') == '':
2352 self._downloader.report_error(u'Redirected URL is still not specific: ' + url)
2354 epTitle = mobj.group('episode')
# Locate the mtvnservices media URI embedded in the page.
2356 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)
2358 if len(mMovieParams) == 0:
2359 # The Colbert Report embeds the information in a without
2360 # a URL prefix; so extract the alternate reference
2361 # and then add the URL prefix manually.
2363 altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
2364 if len(altMovieParams) == 0:
2365 self._downloader.report_error(u'unable to find Flash URL in webpage ' + url)
2368 mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
# Fetch the MRSS index, which lists the episode's parts as <item> elements.
2370 uri = mMovieParams[0][1]
2371 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
2372 self.report_index_download(epTitle)
2374 indexXml = compat_urllib_request.urlopen(indexUrl).read()
2375 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2376 self._downloader.report_error(u'unable to download episode index: ' + compat_str(err))
2381 idoc = xml.etree.ElementTree.fromstring(indexXml)
2382 itemEls = idoc.findall('.//item')
2383 for partNum,itemEl in enumerate(itemEls):
# guid looks like 'mgid:...:<show>.com:<shortId>'; split it apart.
2384 mediaId = itemEl.findall('./guid')[0].text
2385 shortMediaId = mediaId.split(':')[-1]
2386 showId = mediaId.split(':')[-2].replace('.com', '')
2387 officialTitle = itemEl.findall('./title')[0].text
2388 officialDate = itemEl.findall('./pubDate')[0].text
# Per-part configuration XML lists the available renditions.
2390 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2391 compat_urllib_parse.urlencode({'uri': mediaId}))
2392 configReq = compat_urllib_request.Request(configUrl)
2393 self.report_config_download(epTitle, shortMediaId)
2395 configXml = compat_urllib_request.urlopen(configReq).read()
2396 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2397 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
2400 cdoc = xml.etree.ElementTree.fromstring(configXml)
2402 for rendition in cdoc.findall('.//rendition'):
# (bitrate, rtmp URL) pairs; accumulated into 'turls' on an elided line.
2403 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2407 self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
2410 if self._downloader.params.get('listformats', None):
2411 self._print_formats([i[0] for i in turls])
2414 # For now, just pick the highest bitrate
2415 format,rtmp_video_url = turls[-1]
2417 # Get the format arg from the arg stream
2418 req_format = self._downloader.params.get('format', None)
2420 # Select format if we can find one
2423 format, rtmp_video_url = f, v
# The RTMP URL cannot be downloaded directly; rewrite it to the
# equivalent progressive-download HTTP URL on the llnwd.net CDN.
2426 m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
2428 raise ExtractorError(u'Cannot transform RTMP url')
2429 base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
2430 video_url = base + m.group('finalid')
# Parts are 1-indexed in the displayed title.
2432 effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
2437 'upload_date': officialDate,
2442 'description': officialTitle,
2444 results.append(info)
2449 class EscapistIE(InfoExtractor):
2450 """Information extractor for The Escapist """
# NOTE(review): this excerpt is a numbered listing with gaps -- 'try:' headers,
# guards and 'return' statements paired with the lines below are elided.
2452 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2453 IE_NAME = u'escapist'
2455 def report_config_download(self, showName):
2456 self.to_screen(u'%s: Downloading configuration' % showName)
2458 def _real_extract(self, url):
2459 mobj = re.match(self._VALID_URL, url)
2461 self._downloader.report_error(u'invalid URL: %s' % url)
2463 showName = mobj.group('showname')
2464 videoId = mobj.group('episode')
2466 self.report_extraction(showName)
# Decode the page with the charset advertised in the Content-Type header,
# falling back to UTF-8 when none is given.
2468 webPage = compat_urllib_request.urlopen(url)
2469 webPageBytes = webPage.read()
2470 m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
2471 webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
2472 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2473 self._downloader.report_error(u'unable to download webpage: ' + compat_str(err))
# Metadata comes from <meta> tags; the player config URL is embedded in
# the og:video URL's 'config=' query parameter.
# NOTE(review): these .group(1) calls assume every <meta> tag is present --
# a missing tag would raise AttributeError on None.
2476 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2477 description = unescapeHTML(descMatch.group(1))
2478 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2479 imgUrl = unescapeHTML(imgMatch.group(1))
2480 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2481 playerUrl = unescapeHTML(playerUrlMatch.group(1))
2482 configUrlMatch = re.search('config=(.*)$', playerUrl)
2483 configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))
2485 self.report_config_download(showName)
2487 configJSON = compat_urllib_request.urlopen(configUrl)
2488 m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
2489 configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
2490 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2491 self._downloader.report_error(u'unable to download configuration: ' + compat_str(err))
2494 # Technically, it's JavaScript, not JSON
# Crude single-to-double quote swap so json.loads can parse the JS object.
2495 configJSON = configJSON.replace("'", '"')
2498 config = json.loads(configJSON)
2499 except (ValueError,) as err:
2500 self._downloader.report_error(u'Invalid JSON in configuration file: ' + compat_str(err))
2503 playlist = config['playlist']
# The second playlist entry holds the actual video URL.
2504 videoUrl = playlist[1]['url']
2509 'uploader': showName,
2510 'upload_date': None,
2513 'thumbnail': imgUrl,
2514 'description': description,
2515 'player_url': playerUrl,
2520 class CollegeHumorIE(InfoExtractor):
2521 """Information extractor for collegehumor.com"""
# NOTE(review): this excerpt is a numbered listing with gaps -- 'try:' headers,
# guards, the 'info' dict opener and 'return' statements paired with the
# lines below are elided.
2524 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2525 IE_NAME = u'collegehumor'
2527 def report_manifest(self, video_id):
2528 """Report information extraction."""
2529 self.to_screen(u'%s: Downloading XML manifest' % video_id)
2531 def _real_extract(self, url):
2532 mobj = re.match(self._VALID_URL, url)
2534 self._downloader.report_error(u'invalid URL: %s' % url)
2536 video_id = mobj.group('videoid')
2541 'upload_date': None,
# Step 1: metadata XML describes the video and points at the f4m manifest.
2544 self.report_extraction(video_id)
2545 xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
2547 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2548 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2549 self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
2552 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2554 videoNode = mdoc.findall('./video')[0]
2555 info['description'] = videoNode.findall('./description')[0].text
2556 info['title'] = videoNode.findall('./caption')[0].text
2557 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
2558 manifest_url = videoNode.findall('./file')[0].text
2560 self._downloader.report_error(u'Invalid metadata XML file')
# Step 2: fetch the Adobe HDS (f4m) manifest; hdcore is required by the CDN.
2563 manifest_url += '?hdcore=2.10.3'
2564 self.report_manifest(video_id)
2566 manifestXml = compat_urllib_request.urlopen(manifest_url).read()
2567 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2568 self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
# Step 3: pull media/id out of the manifest (f4m XML namespace) and build
# the direct fragment URL from the manifest's scheme and host.
2571 adoc = xml.etree.ElementTree.fromstring(manifestXml)
2573 media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
2574 node_id = media_node.attrib['url']
2575 video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
2576 except IndexError as err:
2577 self._downloader.report_error(u'Invalid manifest file')
2580 url_pr = compat_urllib_parse_urlparse(manifest_url)
2581 url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
2588 class XVideosIE(InfoExtractor):
2589 """Information extractor for xvideos.com"""
# NOTE(review): this excerpt is a numbered listing with gaps -- 'if ... is
# None:' guards and 'return' statements paired with the lines below are
# elided.
2591 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2592 IE_NAME = u'xvideos'
2594 def _real_extract(self, url):
2595 mobj = re.match(self._VALID_URL, url)
2597 self._downloader.report_error(u'invalid URL: %s' % url)
2599 video_id = mobj.group(1)
2601 webpage = self._download_webpage(url, video_id)
2603 self.report_extraction(video_id)
# The media URL is URL-encoded inside the page's flashvars.
2607 mobj = re.search(r'flv_url=(.+?)&', webpage)
2609 self._downloader.report_error(u'unable to extract video url')
2611 video_url = compat_urllib_parse.unquote(mobj.group(1))
2615 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2617 self._downloader.report_error(u'unable to extract video title')
2619 video_title = mobj.group(1)
2622 # Extract video thumbnail
# NOTE(review): unescaped dots in 'xvideos.com' make the pattern looser than
# intended; also group(0) below keeps the whole matched URL, not the
# captured filename in group(1).
2623 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
2625 self._downloader.report_error(u'unable to extract video thumbnail')
2627 video_thumbnail = mobj.group(0)
2633 'upload_date': None,
2634 'title': video_title,
2636 'thumbnail': video_thumbnail,
2637 'description': None,
2643 class SoundcloudIE(InfoExtractor):
2644 """Information extractor for soundcloud.com
2645 To access the media, the uid of the song and a stream token
2646 must be extracted from the page source and the script must make
2647 a request to media.soundcloud.com/crossdomain.xml. Then
2648 the media can be grabbed by requesting from an url composed
2649 of the stream token and uid
# NOTE(review): this excerpt is a numbered listing with gaps -- 'try:' headers,
# guards and 'return' statements paired with the lines below are elided.
2652 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2653 IE_NAME = u'soundcloud'
2655 def report_resolve(self, video_id):
2656 """Report information extraction."""
2657 self.to_screen(u'%s: Resolving id' % video_id)
2659 def _real_extract(self, url):
2660 mobj = re.match(self._VALID_URL, url)
2662 self._downloader.report_error(u'invalid URL: %s' % url)
2665 # extract uploader (which is in the url)
2666 uploader = mobj.group(1)
2667 # extract simple title (uploader + slug of song title)
2668 slug_title = mobj.group(2)
2669 simple_title = uploader + u'-' + slug_title
2671 self.report_resolve('%s/%s' % (uploader, slug_title))
# Step 1: resolve the page URL to the track's API record (JSON).
# The client_id is a fixed, publicly visible API key.
2673 url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
2674 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2675 request = compat_urllib_request.Request(resolv_url)
2677 info_json_bytes = compat_urllib_request.urlopen(request).read()
2678 info_json = info_json_bytes.decode('utf-8')
2679 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2680 self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
2683 info = json.loads(info_json)
2684 video_id = info['id']
2685 self.report_extraction('%s/%s' % (uploader, slug_title))
# Step 2: fetch the per-track stream map and take the 128kbps MP3 stream.
2687 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2688 request = compat_urllib_request.Request(streams_url)
2690 stream_json_bytes = compat_urllib_request.urlopen(request).read()
2691 stream_json = stream_json_bytes.decode('utf-8')
2692 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2693 self._downloader.report_error(u'unable to download stream definitions: %s' % compat_str(err))
2696 streams = json.loads(stream_json)
2697 mediaURL = streams['http_mp3_128_url']
2702 'uploader': info['user']['username'],
# NOTE(review): 'created_at' is passed through verbatim; the documented
# upload_date format is YYYYMMDD -- confirm whether conversion is elided.
2703 'upload_date': info['created_at'],
2704 'title': info['title'],
2706 'description': info['description'],
2709 class SoundcloudSetIE(InfoExtractor):
2710 """Information extractor for soundcloud.com sets
2711 To access the media, the uid of the song and a stream token
2712 must be extracted from the page source and the script must make
2713 a request to media.soundcloud.com/crossdomain.xml. Then
2714 the media can be grabbed by requesting from an url composed
2715 of the stream token and uid
# NOTE(review): this excerpt is a numbered listing with gaps -- 'try:' headers,
# guards and the per-track result accumulation paired with the lines below
# are elided.
2718 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
# NOTE(review): IE_NAME duplicates SoundcloudIE's u'soundcloud'; probably
# intended to be u'soundcloud:set' -- confirm against the extractor registry.
2719 IE_NAME = u'soundcloud'
2721 def report_resolve(self, video_id):
2722 """Report information extraction."""
2723 self.to_screen(u'%s: Resolving id' % video_id)
2725 def _real_extract(self, url):
2726 mobj = re.match(self._VALID_URL, url)
2728 self._downloader.report_error(u'invalid URL: %s' % url)
2731 # extract uploader (which is in the url)
2732 uploader = mobj.group(1)
2733 # extract simple title (uploader + slug of song title)
2734 slug_title = mobj.group(2)
2735 simple_title = uploader + u'-' + slug_title
2737 self.report_resolve('%s/sets/%s' % (uploader, slug_title))
# Step 1: resolve the set URL to its API record; the client_id is a fixed,
# publicly visible API key.
2739 url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
2740 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2741 request = compat_urllib_request.Request(resolv_url)
2743 info_json_bytes = compat_urllib_request.urlopen(request).read()
2744 info_json = info_json_bytes.decode('utf-8')
2745 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2746 self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
2750 info = json.loads(info_json)
# The resolve endpoint reports failures as an 'errors' list.
2751 if 'errors' in info:
2752 for err in info['errors']:
2753 self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
# Step 2: fetch the stream map for every track in the set.
2756 for track in info['tracks']:
2757 video_id = track['id']
2758 self.report_extraction('%s/sets/%s' % (uploader, slug_title))
2760 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2761 request = compat_urllib_request.Request(streams_url)
2763 stream_json_bytes = compat_urllib_request.urlopen(request).read()
2764 stream_json = stream_json_bytes.decode('utf-8')
2765 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2766 self._downloader.report_error(u'unable to download stream definitions: %s' % compat_str(err))
2769 streams = json.loads(stream_json)
2770 mediaURL = streams['http_mp3_128_url']
2775 'uploader': track['user']['username'],
2776 'upload_date': track['created_at'],
2777 'title': track['title'],
2779 'description': track['description'],
2784 class InfoQIE(InfoExtractor):
2785 """Information extractor for infoq.com"""
# NOTE(review): this excerpt is a numbered listing with gaps -- guards and
# 'return' statements paired with the lines below are elided.
2786 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2788 def _real_extract(self, url):
2789 mobj = re.match(self._VALID_URL, url)
2791 self._downloader.report_error(u'invalid URL: %s' % url)
# No separate numeric id exists; the URL itself is used as the video id.
2794 webpage = self._download_webpage(url, video_id=url)
2795 self.report_extraction(url)
# The real media path is base64-encoded in the page's 'jsclassref' variable.
2798 mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
2800 self._downloader.report_error(u'unable to extract video url')
2802 real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
2803 video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id
2806 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2808 self._downloader.report_error(u'unable to extract video title')
2810 video_title = mobj.group(1)
2812 # Extract description
2813 video_description = u'No description available.'
2814 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2815 if mobj is not None:
2816 video_description = mobj.group(1)
# Derive id and extension from the media path's final component.
2818 video_filename = video_url.split('/')[-1]
2819 video_id, extension = video_filename.split('.')
2825 'upload_date': None,
2826 'title': video_title,
2827 'ext': extension, # Extension is always(?) mp4, but seems to be flv
2829 'description': video_description,
2834 class MixcloudIE(InfoExtractor):
2835 """Information extractor for www.mixcloud.com"""
# NOTE(review): this excerpt is a numbered listing with gaps -- 'try:' headers,
# guards, 'break'/'return' statements and 'else:' branches paired with the
# lines below are elided.
2837 _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
2838 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2839 IE_NAME = u'mixcloud'
2841 def report_download_json(self, file_id):
2842 """Report JSON download."""
2843 self.to_screen(u'Downloading json')
2845 def get_urls(self, jsonData, fmt, bitrate='best'):
2846 """Get urls from 'audio_formats' section in json"""
# Formats may map bitrate -> url list, or directly to a url list when no
# bitrate information exists (the TypeError fallback below).
2849 bitrate_list = jsonData[fmt]
2850 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2851 bitrate = max(bitrate_list) # select highest
2853 url_list = jsonData[fmt][bitrate]
2854 except TypeError: # we have no bitrate info.
2855 url_list = jsonData[fmt]
2858 def check_urls(self, url_list):
2859 """Returns 1st active url from list"""
# Probes each candidate URL; network errors mean "try the next one".
2860 for url in url_list:
2862 compat_urllib_request.urlopen(url)
2864 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2869 def _print_formats(self, formats):
2870 print('Available formats:')
2871 for fmt in formats.keys():
2872 for b in formats[fmt]:
2874 ext = formats[fmt][b][0]
2875 print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
2876 except TypeError: # we have no bitrate info
2877 ext = formats[fmt][0]
2878 print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
2881 def _real_extract(self, url):
2882 mobj = re.match(self._VALID_URL, url)
2884 self._downloader.report_error(u'invalid URL: %s' % url)
2886 # extract uploader & filename from url
# NOTE(review): .decode('utf-8') on a match group only works on Python 2
# byte strings; under Python 3 this raises AttributeError -- consistent
# with this extractor being marked _WORKING = False.
2887 uploader = mobj.group(1).decode('utf-8')
2888 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2890 # construct API request
2891 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2892 # retrieve .json file with links to files
2893 request = compat_urllib_request.Request(file_url)
2895 self.report_download_json(file_url)
2896 jsonData = compat_urllib_request.urlopen(request).read()
2897 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2898 self._downloader.report_error(u'Unable to retrieve file: %s' % compat_str(err))
2902 json_data = json.loads(jsonData)
2903 player_url = json_data['player_swf_url']
2904 formats = dict(json_data['audio_formats'])
2906 req_format = self._downloader.params.get('format', None)
2909 if self._downloader.params.get('listformats', None):
2910 self._print_formats(formats)
# With no requested format (or 'best'), take the first format whose URLs
# actually respond; otherwise honor the user's explicit choice.
2913 if req_format is None or req_format == 'best':
2914 for format_param in formats.keys():
2915 url_list = self.get_urls(formats, format_param)
2917 file_url = self.check_urls(url_list)
2918 if file_url is not None:
2921 if req_format not in formats:
2922 self._downloader.report_error(u'format is not available')
2925 url_list = self.get_urls(formats, req_format)
2926 file_url = self.check_urls(url_list)
2927 format_param = req_format
2930 'id': file_id.decode('utf-8'),
2931 'url': file_url.decode('utf-8'),
2932 'uploader': uploader.decode('utf-8'),
2933 'upload_date': None,
2934 'title': json_data['name'],
2935 'ext': file_url.split('.')[-1].decode('utf-8'),
2936 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2937 'thumbnail': json_data['thumbnail_url'],
2938 'description': json_data['description'],
2939 'player_url': player_url.decode('utf-8'),
2942 class StanfordOpenClassroomIE(InfoExtractor):
2943 """Information extractor for Stanford's Open ClassRoom"""
# NOTE(review): this excerpt is a numbered listing with gaps -- 'try:' headers,
# dict/list openers, 'else:' branches and 'return' statements paired with
# the lines below are elided. Also note the unescaped dots in
# 'openclassroom.stanford.edu' in _VALID_URL (matches any character).
2945 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
2946 IE_NAME = u'stanfordoc'
2948 def _real_extract(self, url):
2949 mobj = re.match(self._VALID_URL, url)
2951 raise ExtractorError(u'Invalid URL: %s' % url)
# Three cases by URL specificity: a single video, a course page, or the
# site root -- the latter two return lists of references that the caller
# re-dispatches through self.extract().
2953 if mobj.group('course') and mobj.group('video'): # A specific video
2954 course = mobj.group('course')
2955 video = mobj.group('video')
2957 'id': course + '_' + video,
2959 'upload_date': None,
2962 self.report_extraction(info['id'])
2963 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
2964 xmlUrl = baseUrl + video + '.xml'
2966 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2967 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2968 self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
2970 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2972 info['title'] = mdoc.findall('./title')[0].text
2973 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
2975 self._downloader.report_error(u'Invalid metadata XML file')
2977 info['ext'] = info['url'].rpartition('.')[2]
2979 elif mobj.group('course'): # A course page
2980 course = mobj.group('course')
2985 'upload_date': None,
2988 coursepage = self._download_webpage(url, info['id'],
2989 note='Downloading course info page',
2990 errnote='Unable to download course info page')
2992 m = re.search('<h1>([^<]+)</h1>', coursepage)
2994 info['title'] = unescapeHTML(m.group(1))
# Fall back to the id when the page provides no <h1> title.
2996 info['title'] = info['id']
2998 m = re.search('<description>([^<]+)</description>', coursepage)
3000 info['description'] = unescapeHTML(m.group(1))
# Collect unique VideoPage links; each becomes a 'reference' entry.
3002 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
3005 'type': 'reference',
3006 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
3010 for entry in info['list']:
3011 assert entry['type'] == 'reference'
3012 results += self.extract(entry['url'])
3016 'id': 'Stanford OpenClassroom',
3019 'upload_date': None,
3022 self.report_download_webpage(info['id'])
3023 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
3025 rootpage = compat_urllib_request.urlopen(rootURL).read()
3026 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3027 self._downloader.report_error(u'unable to download course info page: ' + compat_str(err))
3030 info['title'] = info['id']
# Same reference scheme as the course branch, but over CoursePage links.
3032 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
3035 'type': 'reference',
3036 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
3041 for entry in info['list']:
3042 assert entry['type'] == 'reference'
3043 results += self.extract(entry['url'])
# Extractor for MTV.com music videos. Scrapes <meta> tags for song name,
# performer and the mtvn_uri, then fetches a mediaGen XML document and
# takes the last <rendition> element as the download source.
# NOTE(review): subsampled listing -- "if mobj is None:"/try: lines and the
# final info-dict assembly are missing; consult the full source.
3046 class MTVIE(InfoExtractor):
3047 """Information extractor for MTV.com"""
3049 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
3052 def _real_extract(self, url):
3053 mobj = re.match(self._VALID_URL, url)
3055 self._downloader.report_error(u'invalid URL: %s' % url)
# The scheme is optional in _VALID_URL; normalize to http:// if absent.
3057 if not mobj.group('proto'):
3058 url = 'http://' + url
3059 video_id = mobj.group('videoid')
3061 webpage = self._download_webpage(url, video_id)
3063 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
3065 self._downloader.report_error(u'unable to extract song name')
3067 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3068 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
3070 self._downloader.report_error(u'unable to extract performer')
3072 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3073 video_title = performer + ' - ' + song_name
3075 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
3077 self._downloader.report_error(u'unable to mtvn_uri')
3079 mtvn_uri = mobj.group(1)
3081 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
3083 self._downloader.report_error(u'unable to extract content id')
3085 content_id = mobj.group(1)
3087 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
3088 self.report_extraction(video_id)
3089 request = compat_urllib_request.Request(videogen_url)
3091 metadataXml = compat_urllib_request.urlopen(request).read()
3092 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3093 self._downloader.report_error(u'unable to download video metadata: %s' % compat_str(err))
3096 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
3097 renditions = mdoc.findall('.//rendition')
3099 # For now, always pick the highest quality.
3100 rendition = renditions[-1]
# Format string is built as "<ext>-<width>x<height>_<bitrate>" from the
# rendition's MIME type and attributes.
3103 _,_,ext = rendition.attrib['type'].partition('/')
3104 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
3105 video_url = rendition.find('./src').text
3107 self._downloader.report_error('Invalid rendition field.')
3113 'uploader': performer,
3114 'upload_date': None,
3115 'title': video_title,
# Extractor for v.youku.com. Downloads a JSON playlist description, then
# derives per-segment download URLs by running the advertised fileid
# through a seeded character-substitution (see _get_file_id).
# NOTE(review): subsampled listing -- e.g. _gen_sid's "def" line and
# _get_file_ID_mix_string's return are missing (3141->3145 jump); confirm
# against the full source.
3123 class YoukuIE(InfoExtractor):
3124 _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
# Session id: millisecond timestamp concatenated with two random numbers.
3127 nowTime = int(time.time() * 1000)
3128 random1 = random.randint(1000,1998)
3129 random2 = random.randint(1000,9999)
3131 return "%d%d%d" %(nowTime,random1,random2)
3133 def _get_file_ID_mix_string(self, seed):
# Deterministic shuffle of the alphabet driven by the server-supplied
# seed (linear-congruential style update on `seed`).
3135 source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
3137 for i in range(len(source)):
3138 seed = (seed * 211 + 30031 ) % 65536
3139 index = math.floor(seed / 65536 * len(source) )
3140 mixed.append(source[int(index)])
3141 source.remove(source[int(index)])
3142 #return ''.join(mixed)
3145 def _get_file_id(self, fileId, seed):
# Translate each '*'-separated index in fileId through the mixed alphabet.
3146 mixed = self._get_file_ID_mix_string(seed)
3147 ids = fileId.split('*')
3151 realId.append(mixed[int(ch)])
3152 return ''.join(realId)
3154 def _real_extract(self, url):
3155 mobj = re.match(self._VALID_URL, url)
3157 self._downloader.report_error(u'invalid URL: %s' % url)
3159 video_id = mobj.group('ID')
3161 info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
3163 request = compat_urllib_request.Request(info_url, None, std_headers)
3165 self.report_download_webpage(video_id)
3166 jsondata = compat_urllib_request.urlopen(request).read()
3167 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3168 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
3171 self.report_extraction(video_id)
3173 jsonstr = jsondata.decode('utf-8')
3174 config = json.loads(jsonstr)
3176 video_title = config['data'][0]['title']
3177 seed = config['data'][0]['seed']
# Honour the downloader's --format request; 'best'/'worst' map onto the
# formats advertised in streamfileids.
3179 format = self._downloader.params.get('format', None)
3180 supported_format = list(config['data'][0]['streamfileids'].keys())
3182 if format is None or format == 'best':
3183 if 'hd2' in supported_format:
3188 elif format == 'worst':
3196 fileid = config['data'][0]['streamfileids'][format]
3197 keys = [s['k'] for s in config['data'][0]['segs'][format]]
3198 except (UnicodeDecodeError, ValueError, KeyError):
3199 self._downloader.report_error(u'unable to extract info section')
3203 sid = self._gen_sid()
3204 fileid = self._get_file_id(fileid, seed)
3206 #column 8,9 of fileid represent the segment number
3207 #fileid[7:9] should be changed
3208 for index, key in enumerate(keys):
3210 temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
3211 download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
# One info dict per segment, id suffixed with the part number.
3214 'id': '%s_part%02d' % (video_id, index),
3215 'url': download_url,
3217 'upload_date': None,
3218 'title': video_title,
3221 files_info.append(info)
# Extractor for video.xnxx.com: fetches the watch page and pulls the flv
# URL, title and thumbnail out with the three class-level regexes.
# NOTE(review): subsampled listing -- try:/"if result is None:" guards and
# the final info-dict opening are missing; confirm against the full source.
3226 class XNXXIE(InfoExtractor):
3227 """Information extractor for xnxx.com"""
3229 _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
3231 VIDEO_URL_RE = r'flv_url=(.*?)&'
3232 VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
3233 VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'
3235 def _real_extract(self, url):
3236 mobj = re.match(self._VALID_URL, url)
3238 self._downloader.report_error(u'invalid URL: %s' % url)
3240 video_id = mobj.group(1)
3242 self.report_download_webpage(video_id)
3244 # Get webpage content
3246 webpage_bytes = compat_urllib_request.urlopen(url).read()
3247 webpage = webpage_bytes.decode('utf-8')
3248 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3249 self._downloader.report_error(u'unable to download video webpage: %s' % err)
# The flv_url parameter is percent-encoded in the page source.
3252 result = re.search(self.VIDEO_URL_RE, webpage)
3254 self._downloader.report_error(u'unable to extract video url')
3256 video_url = compat_urllib_parse.unquote(result.group(1))
3258 result = re.search(self.VIDEO_TITLE_RE, webpage)
3260 self._downloader.report_error(u'unable to extract video title')
3262 video_title = result.group(1)
3264 result = re.search(self.VIDEO_THUMB_RE, webpage)
3266 self._downloader.report_error(u'unable to extract video thumbnail')
3268 video_thumbnail = result.group(1)
3274 'upload_date': None,
3275 'title': video_title,
3277 'thumbnail': video_thumbnail,
3278 'description': None,
# Extractor for Google+ post videos. Two-step scrape: (1) the post page
# yields date, uploader and title; (2) a photos sub-page is fetched to
# enumerate the actual googlevideo redirector links.
# NOTE(review): subsampled listing -- "if mobj is None:"/try: guards and the
# final return are missing; confirm against the full source.
3282 class GooglePlusIE(InfoExtractor):
3283 """Information extractor for plus.google.com."""
3285 _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
3286 IE_NAME = u'plus.google'
3288 def report_extract_entry(self, url):
3289 """Report downloading extry"""
3290 self.to_screen(u'Downloading entry: %s' % url)
3292 def report_date(self, upload_date):
3293 """Report downloading extry"""
3294 self.to_screen(u'Entry date: %s' % upload_date)
3296 def report_uploader(self, uploader):
3297 """Report downloading extry"""
3298 self.to_screen(u'Uploader: %s' % uploader)
3300 def report_title(self, video_title):
3301 """Report downloading extry"""
3302 self.to_screen(u'Title: %s' % video_title)
3304 def report_extract_vid_page(self, video_page):
3305 """Report information extraction."""
3306 self.to_screen(u'Extracting video page: %s' % video_page)
3308 def _real_extract(self, url):
3309 # Extract id from URL
3310 mobj = re.match(self._VALID_URL, url)
3312 self._downloader.report_error(u'Invalid URL: %s' % url)
3315 post_url = mobj.group(0)
3316 video_id = mobj.group(1)
3318 video_extension = 'flv'
3320 # Step 1, Retrieve post webpage to extract further information
3321 self.report_extract_entry(post_url)
3322 request = compat_urllib_request.Request(post_url)
3324 webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3325 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3326 self._downloader.report_error(u'Unable to retrieve entry webpage: %s' % compat_str(err))
3329 # Extract update date
3331 pattern = 'title="Timestamp">(.*?)</a>'
3332 mobj = re.search(pattern, webpage)
3334 upload_date = mobj.group(1)
3335 # Convert timestring to a format suitable for filename
3336 upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3337 upload_date = upload_date.strftime('%Y%m%d')
3338 self.report_date(upload_date)
3342 pattern = r'rel\="author".*?>(.*?)</a>'
3343 mobj = re.search(pattern, webpage)
3345 uploader = mobj.group(1)
3346 self.report_uploader(uploader)
3349 # Get the first line for title
3351 pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3352 mobj = re.search(pattern, webpage)
3354 video_title = mobj.group(1)
3355 self.report_title(video_title)
3357 # Step 2, Stimulate clicking the image box to launch video
3358 pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3359 mobj = re.search(pattern, webpage)
3361 self._downloader.report_error(u'unable to extract video page URL')
3363 video_page = mobj.group(1)
3364 request = compat_urllib_request.Request(video_page)
3366 webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3367 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3368 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
3370 self.report_extract_vid_page(video_page)
3373 # Extract video links on video page
3374 """Extract video links of all sizes"""
3375 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3376 mobj = re.findall(pattern, webpage)
3378 self._downloader.report_error(u'unable to extract video links')
3380 # Sort in resolution
3381 links = sorted(mobj)
3383 # Choose the lowest of the sort, i.e. highest resolution
3384 video_url = links[-1]
3385 # Only get the url. The resolution part in the tuple has no use anymore
3386 video_url = video_url[-1]
3387 # Treat escaped \u0026 style hex
# Python 2 str has .decode; on Python 3 the AttributeError path re-decodes
# via bytes(...).decode('unicode-escape').
3389 video_url = video_url.decode("unicode_escape")
3390 except AttributeError: # Python 3
3391 video_url = bytes(video_url, 'ascii').decode('unicode-escape')
3397 'uploader': uploader,
3398 'upload_date': upload_date,
3399 'title': video_title,
3400 'ext': video_extension,
# Extractor for NBA.com: the CDN URL is constructed directly from the URL
# path; page metadata (title/date/description) is scraped best-effort via
# the local _findProp helper.
# NOTE(review): subsampled listing -- guard lines, _findProp's else branch
# and the info-dict opening/return are missing; confirm against full source.
3403 class NBAIE(InfoExtractor):
3404 _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
3407 def _real_extract(self, url):
3408 mobj = re.match(self._VALID_URL, url)
3410 self._downloader.report_error(u'invalid URL: %s' % url)
3413 video_id = mobj.group(1)
3414 if video_id.endswith('/index.html'):
3415 video_id = video_id[:-len('/index.html')]
3417 webpage = self._download_webpage(url, video_id)
3419 video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
# Closure over `webpage`: returns the unescaped first group of rexp, or
# (presumably) `default` when it does not match -- TODO confirm, the
# non-match branch is not visible in this listing.
3420 def _findProp(rexp, default=None):
3421 m = re.search(rexp, webpage)
3423 return unescapeHTML(m.group(1))
3427 shortened_video_id = video_id.rpartition('/')[2]
3428 title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
3430 'id': shortened_video_id,
3434 'uploader_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
3435 'description': _findProp(r'<div class="description">(.*?)</h1>'),
3439 class JustinTVIE(InfoExtractor):
3440 """Information extractor for justin.tv and twitch.tv"""
3441 # TODO: One broadcast may be split into multiple videos. The key
3442 # 'broadcast_id' is the same for all parts, and 'broadcast_part'
3443 # starts at 1 and increases. Can we treat all parts as one video?
# NOTE(review): subsampled listing -- try:/guard lines, the per-clip info
# dict opening, and the paging loop header are missing; confirm against
# the full source before editing.
3445 _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
3446 ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
3447 _JUSTIN_PAGE_LIMIT = 100
3448 IE_NAME = u'justin.tv'
3450 def report_download_page(self, channel, offset):
3451 """Report attempt to download a single page of videos."""
3452 self.to_screen(u'%s: Downloading video information from %d to %d' %
3453 (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
3455 # Return count of items, list of *valid* items
3456 def _parse_page(self, url):
3458 urlh = compat_urllib_request.urlopen(url)
3459 webpage_bytes = urlh.read()
3460 webpage = webpage_bytes.decode('utf-8', 'ignore')
3461 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3462 self._downloader.report_error(u'unable to download video info JSON: %s' % compat_str(err))
# A non-list response is the API's error envelope (dict with 'error').
3465 response = json.loads(webpage)
3466 if type(response) != list:
3467 error_text = response.get('error', 'unknown error')
3468 self._downloader.report_error(u'Justin.tv API: %s' % error_text)
3471 for clip in response:
3472 video_url = clip['video_file_url']
3474 video_extension = os.path.splitext(video_url)[1][1:]
# start_time is ISO-like; strip dashes from the date part -> YYYYMMDD.
3475 video_date = re.sub('-', '', clip['start_time'][:10])
3476 video_uploader_id = clip.get('user_id', clip.get('channel_id'))
3477 video_id = clip['id']
3478 video_title = clip.get('title', video_id)
3482 'title': video_title,
3483 'uploader': clip.get('channel_name', video_uploader_id),
3484 'uploader_id': video_uploader_id,
3485 'upload_date': video_date,
3486 'ext': video_extension,
3488 return (len(response), info)
3490 def _real_extract(self, url):
3491 mobj = re.match(self._VALID_URL, url)
3493 self._downloader.report_error(u'invalid URL: %s' % url)
# Group 1 = channel name, group 2 (if present) = single broadcast id;
# mobj.lastindex selects which API endpoint to page through.
3496 api = 'http://api.justin.tv'
3497 video_id = mobj.group(mobj.lastindex)
3499 if mobj.lastindex == 1:
3501 api += '/channel/archives/%s.json'
3503 api += '/broadcast/by_archive/%s.json'
3504 api = api % (video_id,)
3506 self.report_extraction(video_id)
3510 limit = self._JUSTIN_PAGE_LIMIT
3513 self.report_download_page(video_id, offset)
3514 page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
3515 page_count, page_info = self._parse_page(page_url)
3516 info.extend(page_info)
# A short page means we've reached the end of the channel archive.
3517 if not paged or page_count != limit:
# Extractor for funnyordie.com: reads the <video>/<source> tag for the
# media URL; title comes from the player h1 with a <title>-tag fallback.
# NOTE(review): subsampled listing -- guard lines and the info-dict
# return are missing; confirm against the full source.
3522 class FunnyOrDieIE(InfoExtractor):
3523 _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
3525 def _real_extract(self, url):
3526 mobj = re.match(self._VALID_URL, url)
3528 self._downloader.report_error(u'invalid URL: %s' % url)
3531 video_id = mobj.group('id')
3532 webpage = self._download_webpage(url, video_id)
3534 m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
3536 self._downloader.report_error(u'unable to find video information')
3537 video_url = unescapeHTML(m.group('url'))
# Primary title source is the player heading; fall back to <title>.
3539 m = re.search(r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", webpage, flags=re.DOTALL)
3541 m = re.search(r'<title>(?P<title>[^<]+?)</title>', webpage)
3543 self._downloader.report_error(u'Cannot find video title')
3544 title = clean_html(m.group('title'))
3546 m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
3548 desc = unescapeHTML(m.group('desc'))
3557 'description': desc,
# Extractor for store.steampowered.com game trailers. Bypasses the age
# gate by requesting the agecheck URL with fixed age parameters, then
# zips together movie/title/thumbnail matches and returns a playlist.
# NOTE(review): subsampled listing -- the verbose-regex body, the videos
# list initialisation and the per-video dict opening are missing.
3561 class SteamIE(InfoExtractor):
3562 _VALID_URL = r"""http://store.steampowered.com/
3564 (?P<urltype>video|app)/ #If the page is only for videos or for a game
3566 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
3570 def suitable(cls, url):
3571 """Receives a URL and returns True if suitable for this IE."""
# Overridden because _VALID_URL is a verbose (re.VERBOSE) pattern.
3572 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
3574 def _real_extract(self, url):
3575 m = re.match(self._VALID_URL, url, re.VERBOSE)
3576 gameID = m.group('gameID')
3577 videourl = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970' % gameID
3578 self.report_age_confirmation()
3579 webpage = self._download_webpage(videourl, gameID)
3580 game_title = re.search(r'<h2 class="pageheader">(?P<game_title>.*?)</h2>', webpage).group('game_title')
3582 urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
3583 mweb = re.finditer(urlRE, webpage)
3584 namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
3585 titles = re.finditer(namesRE, webpage)
3586 thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
3587 thumbs = re.finditer(thumbsRE, webpage)
# The three iterators are assumed to be in the same page order.
3589 for vid,vtitle,thumb in zip(mweb,titles,thumbs):
3590 video_id = vid.group('videoID')
3591 title = vtitle.group('videoName')
3592 video_url = vid.group('videoURL')
3593 video_thumb = thumb.group('thumbnail')
3595 self._downloader.report_error(u'Cannot find video url for %s' % video_id)
3600 'title': unescapeHTML(title),
3601 'thumbnail': video_thumb
3604 return [self.playlist_result(videos, gameID, game_title)]
# Extractor for recorded ustream.tv videos: media URL is derived directly
# from the numeric video id; title/uploader are scraped from data
# attributes on the page.
# NOTE(review): subsampled listing -- the info-dict opening/return lines
# are missing; confirm against the full source.
3606 class UstreamIE(InfoExtractor):
3607 _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
3608 IE_NAME = u'ustream'
3610 def _real_extract(self, url):
3611 m = re.match(self._VALID_URL, url)
3612 video_id = m.group('videoID')
3613 video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
3614 webpage = self._download_webpage(url, video_id)
3615 m = re.search(r'data-title="(?P<title>.+)"',webpage)
3616 title = m.group('title')
3617 m = re.search(r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',webpage)
3618 uploader = m.group('uploader')
3624 'uploader': uploader
# Extractor for worldstarhiphop.com (and the "candy" mirror): greps the
# page source for an hw-videos mp4/flv URL; for candy videos a second
# title regex is tried and used to fix up the title.
# NOTE(review): subsampled listing -- try:, ext selection branches and the
# final result assembly are missing; confirm against the full source.
3628 class WorldStarHipHopIE(InfoExtractor):
3629 _VALID_URL = r'http://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
3630 IE_NAME = u'WorldStarHipHop'
3632 def _real_extract(self, url):
3633 _src_url = r"""(http://hw-videos.*(?:mp4|flv))"""
3635 webpage_src = compat_urllib_request.urlopen(url).read()
3636 webpage_src = webpage_src.decode('utf-8')
3638 mobj = re.search(_src_url, webpage_src)
3640 m = re.match(self._VALID_URL, url)
3641 video_id = m.group('id')
3643 if mobj is not None:
3644 video_url = mobj.group()
# Extension choice depends on whether the matched URL is mp4 or flv.
3645 if 'mp4' in video_url:
3650 self._downloader.report_error(u'Cannot find video url for %s' % video_id)
3653 _title = r"""<title>(.*)</title>"""
3655 mobj = re.search(_title, webpage_src)
3657 if mobj is not None:
3658 title = mobj.group(1)
# Fallback title when the page has no usable <title>.
3660 title = 'World Start Hip Hop - %s' % time.ctime()
3662 _thumbnail = r"""rel="image_src" href="(.*)" />"""
3663 mobj = re.search(_thumbnail, webpage_src)
3665 # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
3666 if mobj is not None:
3667 thumbnail = mobj.group(1)
3669 _title = r"""candytitles.*>(.*)</span>"""
3670 mobj = re.search(_title, webpage_src)
3671 if mobj is not None:
3672 title = mobj.group(1)
3679 'thumbnail' : thumbnail,
# Extractor for rbmaradio.com shows: the show metadata is embedded as a
# JSON blob in a window.gon assignment; audio URL = akamai_url + '&cbr=256'.
# NOTE(review): subsampled listing -- try: opener and the info-dict
# opening/return are missing; confirm against the full source.
3684 class RBMARadioIE(InfoExtractor):
3685 _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'
3687 def _real_extract(self, url):
3688 m = re.match(self._VALID_URL, url)
3689 video_id = m.group('videoID')
3691 webpage = self._download_webpage(url, video_id)
3692 m = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
3694 raise ExtractorError(u'Cannot find metadata')
3695 json_data = m.group(1)
3698 data = json.loads(json_data)
3699 except ValueError as e:
3700 raise ExtractorError(u'Invalid JSON: ' + str(e))
3702 video_url = data['akamai_url'] + '&cbr=256'
# Extension is taken from the URL path, ignoring the query string.
3703 url_parts = compat_urllib_parse_urlparse(video_url)
3704 video_ext = url_parts.path.rpartition('.')[2]
3709 'title': data['title'],
3710 'description': data.get('teaser_text'),
3711 'location': data.get('country_of_origin'),
3712 'uploader': data.get('host', {}).get('name'),
3713 'uploader_id': data.get('host', {}).get('slug'),
3714 'thumbnail': data.get('image', {}).get('large_url_2x'),
3715 'duration': data.get('duration'),
3720 class YouPornIE(InfoExtractor):
3721 """Information extractor for youporn.com."""
# NOTE(review): subsampled listing -- guard lines, the per-format dict
# opening, the size/bitrate unpacking and several return statements are
# missing; confirm against the full source before editing.
3722 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
3724 def _print_formats(self, formats):
3725 """Print all available formats"""
3726 print(u'Available formats:')
3727 print(u'ext\t\tformat')
3728 print(u'---------------------------------')
3729 for format in formats:
3730 print(u'%s\t\t%s' % (format['ext'], format['format']))
# Select the single entry of `formats` whose 'format' equals req_format.
3732 def _specific(self, req_format, formats):
3734 if(x["format"]==req_format):
3738 def _real_extract(self, url):
3739 mobj = re.match(self._VALID_URL, url)
3741 self._downloader.report_error(u'invalid URL: %s' % url)
3744 video_id = mobj.group('videoid')
# The age_verified cookie bypasses the site's age gate.
3746 req = compat_urllib_request.Request(url)
3747 req.add_header('Cookie', 'age_verified=1')
3748 webpage = self._download_webpage(req, video_id)
3750 # Get the video title
3751 result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
3753 raise ExtractorError(u'Unable to extract video title')
3754 video_title = result.group('title').strip()
3756 # Get the video date
3757 result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
3759 self._downloader.report_warning(u'unable to extract video date')
3762 upload_date = result.group('date').strip()
3764 # Get the video uploader
3765 result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
3767 self._downloader.report_warning(u'unable to extract uploader')
3768 video_uploader = None
3770 video_uploader = result.group('uploader').strip()
3771 video_uploader = clean_html( video_uploader )
3773 # Get all of the formats available
3774 DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
3775 result = re.search(DOWNLOAD_LIST_RE, webpage)
3777 raise ExtractorError(u'Unable to extract download list')
3778 download_list_html = result.group('download_list').strip()
3780 # Get all of the links from the page
3781 LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
3782 links = re.findall(LINK_RE, download_list_html)
3783 if(len(links) == 0):
3784 raise ExtractorError(u'ERROR: no known formats available for video')
3786 self.to_screen(u'Links found: %d' % len(links))
3791 # A link looks like this:
3792 # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
3793 # A path looks like this:
3794 # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
# Each link's path encodes resolution and bitrate in its 5th segment,
# e.g. "480p_370k_8004515".
3795 video_url = unescapeHTML( link )
3796 path = compat_urllib_parse_urlparse( video_url ).path
3797 extension = os.path.splitext( path )[1][1:]
3798 format = path.split('/')[4].split('_')[:2]
3801 format = "-".join( format )
3802 title = u'%s-%s-%s' % (video_title, size, bitrate)
3807 'uploader': video_uploader,
3808 'upload_date': upload_date,
3813 'description': None,
# Format selection mirrors the downloader's --format semantics:
# best (first), worst (last), all, or a specific format via _specific.
3817 if self._downloader.params.get('listformats', None):
3818 self._print_formats(formats)
3821 req_format = self._downloader.params.get('format', None)
3822 self.to_screen(u'Format: %s' % req_format)
3824 if req_format is None or req_format == 'best':
3826 elif req_format == 'worst':
3827 return [formats[-1]]
3828 elif req_format in ('-1', 'all'):
3831 format = self._specific( req_format, formats )
3833 self._downloader.report_error(u'requested format not available')
# Extractor for pornotube.com: both the video id and the display title
# come straight from the URL; the flv URL and upload date are scraped
# from the page.
# NOTE(review): subsampled listing -- guard lines and parts of the info
# dict are missing; confirm against the full source. The error message on
# 3867 says "title" but the failing extraction is the upload date.
3839 class PornotubeIE(InfoExtractor):
3840 """Information extractor for pornotube.com."""
3841 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
3843 def _real_extract(self, url):
3844 mobj = re.match(self._VALID_URL, url)
3846 self._downloader.report_error(u'invalid URL: %s' % url)
3849 video_id = mobj.group('videoid')
3850 video_title = mobj.group('title')
3852 # Get webpage content
3853 webpage = self._download_webpage(url, video_id)
3856 VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
3857 result = re.search(VIDEO_URL_RE, webpage)
3859 self._downloader.report_error(u'unable to extract video url')
3861 video_url = compat_urllib_parse.unquote(result.group('url'))
3863 #Get the uploaded date
3864 VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
3865 result = re.search(VIDEO_UPLOADED_RE, webpage)
3867 self._downloader.report_error(u'unable to extract video title')
3869 upload_date = result.group('date')
3871 info = {'id': video_id,
3874 'upload_date': upload_date,
3875 'title': video_title,
# Extractor for youjizz.com: resolves the watch page to an embed page,
# then reads the flv URL out of the embed player's addVariable call.
# NOTE(review): subsampled listing -- guard lines and parts of the final
# info dict are missing; confirm against the full source.
3881 class YouJizzIE(InfoExtractor):
3882 """Information extractor for youjizz.com."""
3883 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'
3885 def _real_extract(self, url):
3886 mobj = re.match(self._VALID_URL, url)
3888 self._downloader.report_error(u'invalid URL: %s' % url)
3891 video_id = mobj.group('videoid')
3893 # Get webpage content
3894 webpage = self._download_webpage(url, video_id)
3896 # Get the video title
3897 result = re.search(r'<title>(?P<title>.*)</title>', webpage)
3899 raise ExtractorError(u'ERROR: unable to extract video title')
3900 video_title = result.group('title').strip()
3902 # Get the embed page
3903 result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
3905 raise ExtractorError(u'ERROR: unable to extract embed page')
# The numeric embed id replaces the slug-style id from the watch URL.
3907 embed_page_url = result.group(0).strip()
3908 video_id = result.group('videoid')
3910 webpage = self._download_webpage(embed_page_url, video_id)
3913 result = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
3915 raise ExtractorError(u'ERROR: unable to extract video url')
3916 video_url = result.group('source')
3918 info = {'id': video_id,
3920 'title': video_title,
3923 'player_url': embed_page_url}
# Extractor for 8tracks.com mixes: reads the PAGE.mix JSON from the mix
# page, then walks the play/next API with a random session id until
# at_last_track, yielding one entry per song.
# NOTE(review): subsampled listing -- guard lines, the mix_id assignment
# (referenced on 3948/3968) and the result-list handling are missing;
# confirm against the full source.
3927 class EightTracksIE(InfoExtractor):
3929 _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'
3931 def _real_extract(self, url):
3932 mobj = re.match(self._VALID_URL, url)
3934 raise ExtractorError(u'Invalid URL: %s' % url)
3935 playlist_id = mobj.group('id')
3937 webpage = self._download_webpage(url, playlist_id)
3939 m = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
3941 raise ExtractorError(u'Cannot find trax information')
3942 json_like = m.group(1)
3943 data = json.loads(json_like)
# Random per-extraction session id required by the play API.
3945 session = str(random.randint(0, 1000000000))
3947 track_count = data['tracks_count']
3948 first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
3949 next_url = first_url
3951 for i in itertools.count():
3952 api_json = self._download_webpage(next_url, playlist_id,
3953 note=u'Downloading song information %s/%s' % (str(i+1), track_count),
3954 errnote=u'Failed to download song information')
3955 api_data = json.loads(api_json)
3956 track_data = api_data[u'set']['track']
3958 'id': track_data['id'],
3959 'url': track_data['track_file_stream_url'],
3960 'title': track_data['performer'] + u' - ' + track_data['name'],
3961 'raw_title': track_data['name'],
3962 'uploader_id': data['user']['login'],
# Loop terminates when the API flags the last track of the set.
3966 if api_data['set']['at_last_track']:
3968 next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
# Extractor for keek.com: video and thumbnail URLs are derived directly
# from the id on the CDN; title/uploader are scraped from the page.
# NOTE(review): subsampled listing -- the info-dict opening/return lines
# are missing; confirm against the full source.
3971 class KeekIE(InfoExtractor):
3972 _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
3975 def _real_extract(self, url):
3976 m = re.match(self._VALID_URL, url)
3977 video_id = m.group('videoID')
3978 video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
3979 thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
3980 webpage = self._download_webpage(url, video_id)
3981 m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
3982 title = unescapeHTML(m.group('title'))
3983 m = re.search(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>', webpage)
3984 uploader = clean_html(m.group('uploader'))
3990 'thumbnail': thumbnail,
3991 'uploader': uploader
# Extractor for ted.com talks and playlists. The verbose _VALID_URL
# distinguishes the two via the type_playlist / type_talk groups;
# playlists are expanded into url_result entries that re-enter TEDIE.
# NOTE(review): subsampled listing -- alternation lines of the verbose
# regex and parts of the talk info dict are missing; confirm against the
# full source.
3995 class TEDIE(InfoExtractor):
3996 _VALID_URL=r'''http://www.ted.com/
3998 ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
4000 ((?P<type_talk>talks)) # We have a simple talk
4002 /(?P<name>\w+) # Here goes the name and then ".html"
4006 def suitable(cls, url):
4007 """Receives a URL and returns True if suitable for this IE."""
# Overridden because _VALID_URL is a verbose (re.VERBOSE) pattern.
4008 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
4010 def _real_extract(self, url):
4011 m=re.match(self._VALID_URL, url, re.VERBOSE)
4012 if m.group('type_talk'):
4013 return [self._talk_info(url)]
4015 playlist_id=m.group('playlist_id')
4016 name=m.group('name')
4017 self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
4018 return [self._playlist_videos_info(url,name,playlist_id)]
4020 def _talk_video_link(self,mediaSlug):
4021 '''Returns the video link for that mediaSlug'''
4022 return 'http://download.ted.com/talks/%s.mp4' % mediaSlug
4024 def _playlist_videos_info(self,url,name,playlist_id=0):
4025 '''Returns the videos of the playlist'''
4027 <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
4028 ([.\s]*?)data-playlist_item_id="(\d+)"
4029 ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
4031 video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
4032 webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
4033 m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
4034 m_names=re.finditer(video_name_RE,webpage)
4036 playlist_RE = r'div class="headline">(\s*?)<h1>(\s*?)<span>(?P<playlist_title>.*?)</span>'
4037 m_playlist = re.search(playlist_RE, webpage)
4038 playlist_title = m_playlist.group('playlist_title')
# Each talk becomes a url_result that is handled again by TEDIE.
4040 playlist_entries = []
4041 for m_video, m_name in zip(m_videos,m_names):
4042 video_id=m_video.group('video_id')
4043 talk_url='http://www.ted.com%s' % m_name.group('talk_url')
4044 playlist_entries.append(self.url_result(talk_url, 'TED'))
4045 return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)
4047 def _talk_info(self, url, video_id=0):
4048 """Return the video for the talk in the url"""
4049 m=re.match(self._VALID_URL, url,re.VERBOSE)
4050 videoName=m.group('name')
4051 webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
4052 # If the url includes the language we get the title translated
4053 title_RE=r'<span id="altHeadline" >(?P<title>.*)</span>'
4054 title=re.search(title_RE, webpage).group('title')
# The talkDetails script blob carries the numeric id and mediaSlug used
# to build the download URL.
4055 info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
4056 "id":(?P<videoID>[\d]+).*?
4057 "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
4058 thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
4059 thumb_match=re.search(thumb_RE,webpage)
4060 info_match=re.search(info_RE,webpage,re.VERBOSE)
4061 video_id=info_match.group('videoID')
4062 mediaSlug=info_match.group('mediaSlug')
4063 video_url=self._talk_video_link(mediaSlug)
4069 'thumbnail': thumb_match.group('thumbnail')
# Extractor for myspass.de: the video id is the last (or second-to-last,
# when the URL ends with a slash) path element; all metadata comes from
# a per-video XML endpoint.
# NOTE(review): subsampled listing -- the trailing-slash check before
# 4084, else-branches and the final info dict are partly missing; confirm
# against the full source.
4073 class MySpassIE(InfoExtractor):
4074 _VALID_URL = r'http://www.myspass.de/.*'
4076 def _real_extract(self, url):
4077 META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'
4079 # video id is the last path element of the URL
4080 # usually there is a trailing slash, so also try the second but last
4081 url_path = compat_urllib_parse_urlparse(url).path
4082 url_parent_path, video_id = os.path.split(url_path)
4084 _, video_id = os.path.split(url_parent_path)
4087 metadata_url = META_DATA_URL_TEMPLATE % video_id
4088 metadata_text = self._download_webpage(metadata_url, video_id)
4089 metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))
4091 # extract values from metadata
4092 url_flv_el = metadata.find('url_flv')
4093 if url_flv_el is None:
4094 self._downloader.report_error(u'unable to extract download url')
4096 video_url = url_flv_el.text
4097 extension = os.path.splitext(video_url)[1][1:]
4098 title_el = metadata.find('title')
4099 if title_el is None:
4100 self._downloader.report_error(u'unable to extract title')
4102 title = title_el.text
# format/description/thumbnail are optional in the metadata XML.
4103 format_id_el = metadata.find('format_id')
4104 if format_id_el is None:
4107 format = format_id_el.text
4108 description_el = metadata.find('description')
4109 if description_el is not None:
4110 description = description_el.text
4113 imagePreview_el = metadata.find('imagePreview')
4114 if imagePreview_el is not None:
4115 thumbnail = imagePreview_el.text
4124 'thumbnail': thumbnail,
4125 'description': description
# Spiegel.de video extractor: scrapes the title from the HTML page, then reads
# filename and duration from a per-video XML document on video2.spiegel.de.
# NOTE(review): damaged paste — lines carry stray original line numbers and
# several lines ('if m is None:' guards, the return dict) are missing.
# Code below is kept byte-identical; only comments were added.
4129 class SpiegelIE(InfoExtractor):
4130 _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'
4132 def _real_extract(self, url):
4133 m = re.match(self._VALID_URL, url)
4134 video_id = m.group('videoID')
4136 webpage = self._download_webpage(url, video_id)
4137 m = re.search(r'<div class="spVideoTitle">(.*?)</div>', webpage)
# NOTE(review): the 'if m is None:' guard before this raise is among the
# missing lines of this paste.
4139 raise ExtractorError(u'Cannot find title')
4140 video_title = unescapeHTML(m.group(1))
# The per-video XML lives at a fixed URL derived from the numeric id.
4142 xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
4143 xml_code = self._download_webpage(xml_url, video_id,
4144 note=u'Downloading XML', errnote=u'Failed to download XML')
4146 idoc = xml.etree.ElementTree.fromstring(xml_code)
# Uses the document's last child element — presumably the preferred variant;
# TODO(review): confirm against the XML schema.
4147 last_type = idoc[-1]
4148 filename = last_type.findall('./filename')[0].text
4149 duration = float(last_type.findall('./duration')[0].text)
4151 video_url = 'http://video2.spiegel.de/flash/' + filename
# Extension = everything after the last dot of the filename.
4152 video_ext = filename.rpartition('.')[2]
# Tail of the returned info dict — its opening lines (4153-4156) are missing.
4157 'title': video_title,
4158 'duration': duration,
# LiveLeak extractor: pulls the direct file URL from an inline player config
# and title/description/uploader from og: meta tags and page markup.
# NOTE(review): damaged paste — lines carry stray original line numbers and
# several lines ('if ... is None:' guards, early returns, the return dict) are
# missing. Code below is kept byte-identical; only comments were added.
4162 class LiveLeakIE(InfoExtractor):
# NOTE(review): defect — '(?:http?://)' makes only the final 'p' optional, so
# it matches 'htt://' but can never match 'https://'; '(?:https?://)' was
# almost certainly intended. Left unchanged because the block is incomplete.
4164 _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
4165 IE_NAME = u'liveleak'
4167 def _real_extract(self, url):
4168 mobj = re.match(self._VALID_URL, url)
# The 'if mobj is None:' guard before this error report is among the missing lines.
4170 self._downloader.report_error(u'invalid URL: %s' % url)
4173 video_id = mobj.group('video_id')
4175 webpage = self._download_webpage(url, video_id)
# Direct media URL from the inline JS player config: file: "...",
4177 m = re.search(r'file: "(.*?)",', webpage)
4179 self._downloader.report_error(u'unable to find video url')
4181 video_url = m.group(1)
4183 m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
4185 self._downloader.report_error(u'Cannot find video title')
# Strip the site's "LiveLeak.com -" prefix from the og:title value.
4186 title = unescapeHTML(m.group('title')).replace('LiveLeak.com -', '').strip()
4188 m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
4190 desc = unescapeHTML(m.group('desc'))
# Uploader name scraped from the "By: <a>...</a>" markup (optional field).
4194 m = re.search(r'By:.*?(\w+)</a>', webpage)
4196 uploader = clean_html(m.group(1))
# Tail of the returned info dict — its opening lines (4197-4204) are missing.
4205 'description': desc,
4206 'uploader': uploader
# ARD Mediathek / daserste.de extractor: finds all mediaCollection streams on
# the page, picks the highest-quality default-type stream, and returns either
# an RTMP or a plain HTTP download.
# NOTE(review): damaged paste — lines carry stray original line numbers and
# several 'if'/'else' framing lines are missing. Code below is kept
# byte-identical; only comments were added.
4211 class ARDIE(InfoExtractor):
4212 _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
4213 _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
# Each addMediaStream(...) call carries (media_type, quality, rtmp_url, video_url).
4214 _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'
4216 def _real_extract(self, url):
4217 # determine video id from url
4218 m = re.match(self._VALID_URL, url)
# Prefer a numeric documentId query parameter over the path element when present.
# (The surrounding 'if numid:'/'else:' lines are among those missing here.)
4220 numid = re.search(r'documentId=([0-9]+)', url)
4222 video_id = numid.group(1)
4224 video_id = m.group('video_id')
4226 # determine title and media streams from webpage
4227 html = self._download_webpage(url, video_id)
4228 title = re.search(self._TITLE, html).group('title')
4229 streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
# Reached when no streams were found (the 'if not streams:' guard is missing
# from this paste): the page then carries an age-restriction ("fsk") marker.
4231 assert '"fsk"' in html
4232 self._downloader.report_error(u'this video is only available after 8:00 pm')
4235 # choose default media type and highest quality for now
4236 stream = max([s for s in streams if int(s["media_type"]) == 0],
4237 key=lambda s: int(s["quality"]))
4239 # there's two possibilities: RTMP stream or HTTP download
4240 info = {'id': video_id, 'title': title, 'ext': 'mp4'}
4241 if stream['rtmp_url']:
4242 self.to_screen(u'RTMP download detected')
4243 assert stream['video_url'].startswith('mp4:')
4244 info["url"] = stream["rtmp_url"]
4245 info["play_path"] = stream['video_url']
# HTTP-download branch (its 'else:' line is among the missing lines).
4247 assert stream["video_url"].endswith('.mp4')
4248 info["url"] = stream["video_url"]
# Tumblr video extractor: rebuilds the canonical post URL, then scrapes the
# escaped (\x22-quoted) inline player markup for the video file URL, extension,
# first poster thumbnail and the post title.
# NOTE(review): damaged paste — lines carry stray original line numbers and
# the tail of the returned list/dict (after 4278) is missing. Code below is
# kept byte-identical; only comments were added.
4251 class TumblrIE(InfoExtractor):
# NOTE(review): the dots around 'tumblr.com' are unescaped, so they match any
# character; harmless for dispatch but imprecise.
4252 _VALID_URL = r'http://(?P<blog_name>.*?).tumblr.com/((post)|(video))/(?P<id>\d*)/(.*?)'
4254 def _real_extract(self, url):
4255 m_url = re.match(self._VALID_URL, url)
4256 video_id = m_url.group('id')
4257 blog = m_url.group('blog_name')
# Normalize to the canonical /post/<id>/ URL before downloading.
4259 url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
4260 webpage = self._download_webpage(url, video_id)
# The player markup is embedded with \x22 escapes instead of quote characters.
4262 re_video = r'src=\\x22(?P<video_url>http://%s.tumblr.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
4263 video = re.search(re_video, webpage)
# NOTE(review): the 'if video is None:' guard is among the missing lines; the
# message typo 'founded' is a runtime string and is left unchanged.
4265 self.to_screen("No video founded")
4267 video_url = video.group('video_url')
4268 ext = video.group('ext')
4270 re_thumb = r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22' # We pick the first poster
# Strip the \x22 escaping backslashes from the captured thumbnail URL.
4271 thumb = re.search(re_thumb, webpage).group('thumb').replace('\\', '')
4273 # The only place where you can get a title, it's not complete,
4274 # but searching in other places doesn't work for all videos
4275 re_title = r'<title>(.*?) - (?P<title>.*?)</title>'
4276 title = unescapeHTML(re.search(re_title, webpage).group('title'))
# Start of the returned single-entry list — its remaining lines (4279-4285)
# are missing from this paste.
4278 return [{'id': video_id,
# Module-level factory: builds the ordered list of extractor instances that
# FileDownloader probes in sequence (first match wins).
# NOTE(review): damaged paste — most of the list body (and the closing of the
# docstring / the return statement) is missing from this chunk; no comments
# are inserted below the docstring opening to avoid landing inside the string
# literal. Code is kept byte-identical.
4286 def gen_extractors():
4287 """ Return a list of an instance of every supported extractor.
4288 The order does matter; the first extractor matched is the one handling the URL.
4291 YoutubePlaylistIE(),
4316 StanfordOpenClassroomIE(),
4326 WorldStarHipHopIE(),
def get_info_extractor(ie_name):
    """Look up and return the extractor class named ``<ie_name>IE``.

    The class is resolved from this module's global namespace, so a
    ``KeyError`` propagates when no extractor with that name is defined.
    """
    class_name = '%sIE' % ie_name
    return globals()[class_name]