2 # -*- coding: utf-8 -*-
4 from __future__ import absolute_import
15 import xml.etree.ElementTree
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    title:          Video title, unescaped.
    ext:            Video filename extension.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The subtitle file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    The fields should all be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """
    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        # The downloader can also be attached later via set_downloader().
        self.set_downloader(downloader)
78 def suitable(cls, url):
79 """Receives a URL and returns True if suitable for this IE."""
80 return re.match(cls._VALID_URL, url) is not None
84 """Getter method for _WORKING."""
88 """Initializes an instance (authentication, etc)."""
90 self._real_initialize()
93 def extract(self, url):
94 """Extracts URL information and returns it in list of dicts."""
96 return self._real_extract(url)
    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        # Stored for later use by to_screen() and the extraction methods.
        self._downloader = downloader
    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""

        # NOTE(review): the return below belongs to the IE_NAME property
        # (strips the trailing "IE" from the class name); its `def`/property
        # lines are not visible in this excerpt.
        return type(self).__name__[:-2]
    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the response handle """
        # note=None appears to fall back to the generic "Downloading webpage"
        # message, while note=False suppresses output entirely.
        self.report_download_webpage(video_id)
        elif note is not False:
            self.to_screen(u'%s: %s' % (video_id, note))
        return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            errnote = u'Unable to download webpage'
            # Re-raise as ExtractorError, preserving the original traceback.
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])
    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the data of the page as a string """
        urlh = self._request_webpage(url_or_request, video_id, note, errnote)
        # Sniff the charset from the Content-Type header, e.g.
        # "text/html; charset=utf-8".
        content_type = urlh.headers.get('Content-Type', '')
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        encoding = m.group(1)
        webpage_bytes = urlh.read()
        if self._downloader.params.get('dump_intermediate_pages', False):
            # Plain URL strings have no get_full_url(); Request objects do.
            url = url_or_request.get_full_url()
            except AttributeError:
            # Base64 keeps the dump safe to print regardless of content.
            self.to_screen(u'Dumping request to ' + url)
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        # 'replace' avoids raising on bytes invalid in the sniffed encoding.
        return webpage_bytes.decode(encoding, 'replace')
147 def to_screen(self, msg):
148 """Print msg to screen, prefixing it with '[ie_name]'"""
149 self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))
151 def report_extraction(self, id_or_name):
152 """Report information extraction."""
153 self.to_screen(u'%s: Extracting information' % id_or_name)
155 def report_download_webpage(self, video_id):
156 """Report webpage download."""
157 self.to_screen(u'%s: Downloading webpage' % video_id)
159 def report_age_confirmation(self):
160 """Report attempt to confirm age."""
161 self.to_screen(u'Confirming age')
    # Helpers for issue #608: they set the correct value of the '_type' key
    # on the result dictionaries they produce.
    def video_result(self, video_info):
        """Returns a video"""
        video_info['_type'] = 'video'

    def url_result(self, url, ie=None):
        """Returns a url that points to a page that should be processed"""
        #TODO: ie should be the class used for getting the info
        video_info = {'_type': 'url',
    def playlist_result(self, entries, playlist_id=None, playlist_title=None):
        """Returns a playlist"""
        video_info = {'_type': 'playlist',
        # id/title are only attached when the caller supplied them.
        video_info['id'] = playlist_id
        video_info['title'] = playlist_title
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

        (?:https?://)?                                       # http(s):// (optional)
        (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
           tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
        (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
        (?:                                                  # the various things that can precede the ID:
            (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
            |(?:                                             # or the v= param in all its forms
                (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
            )?                                               # optional -> youtube.com/xxxx is OK
        )?                                                   # all until now is optional -> you can pass the naked ID
        ([0-9A-Za-z_-]+)                                     # here is it! the YouTube video ID
        (?(1).+)?                                            # if we found the ID, everything can follow
    # Forces the English interface so later page regexes match reliably.
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    # Captures the target of a next_url= redirect (age gate etc.).
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality (itag codes)
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    _video_extensions = {
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
    _video_dimensions = {
248 def suitable(cls, url):
249 """Receives a URL and returns True if suitable for this IE."""
250 if YoutubePlaylistIE.suitable(url): return False
251 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
253 def report_lang(self):
254 """Report attempt to set language."""
255 self.to_screen(u'Setting language')
257 def report_login(self):
258 """Report attempt to log in."""
259 self.to_screen(u'Logging in')
261 def report_video_webpage_download(self, video_id):
262 """Report attempt to download video webpage."""
263 self.to_screen(u'%s: Downloading video webpage' % video_id)
265 def report_video_info_webpage_download(self, video_id):
266 """Report attempt to download video info webpage."""
267 self.to_screen(u'%s: Downloading video info webpage' % video_id)
269 def report_video_subtitles_download(self, video_id):
270 """Report attempt to download video info webpage."""
271 self.to_screen(u'%s: Checking available subtitles' % video_id)
273 def report_video_subtitles_request(self, video_id, sub_lang, format):
274 """Report attempt to download video info webpage."""
275 self.to_screen(u'%s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))
277 def report_video_subtitles_available(self, video_id, sub_lang_list):
278 """Report available subtitles."""
279 sub_lang = ",".join(list(sub_lang_list.keys()))
280 self.to_screen(u'%s: Available subtitles for video: %s' % (video_id, sub_lang))
282 def report_information_extraction(self, video_id):
283 """Report attempt to extract video information."""
284 self.to_screen(u'%s: Extracting video information' % video_id)
286 def report_unavailable_format(self, video_id, format):
287 """Report extracted video URL."""
288 self.to_screen(u'%s: Format %s not available' % (video_id, format))
290 def report_rtmp_download(self):
291 """Indicate the download will use the RTMP protocol."""
292 self.to_screen(u'RTMP download detected')
    def _get_available_subtitles(self, video_id):
        """Fetch the subtitle track list for *video_id*.

        Returns a dict mapping language codes to track names; on failure
        returns an (error_message, None) tuple instead.
        """
        self.report_video_subtitles_download(video_id)
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
        sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'unable to download video subtitles: %s' % compat_str(err), None)
        # The listing is XML; scrape (name, lang_code) pairs with a regex.
        sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
        sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
        if not sub_lang_list:
            return (u'video doesn\'t have subtitles', None)
307 def _list_available_subtitles(self, video_id):
308 sub_lang_list = self._get_available_subtitles(video_id)
309 self.report_video_subtitles_available(video_id, sub_lang_list)
    def _request_subtitle(self, sub_lang, sub_name, video_id, format):
        (error_message, sub_lang, sub)
        self.report_video_subtitles_request(video_id, sub_lang, format)
        params = compat_urllib_parse.urlencode({
        url = 'http://www.youtube.com/api/timedtext?' + params
        sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
        # An empty body means the API had no captions for this track.
        return (u'Did not fetch video subtitles', None, None)
        # Success: (None error, language code, subtitle text).
        return (None, sub_lang, sub)
    def _extract_subtitle(self, video_id):
        """
        Return a list with a tuple:
        [(error_message, sub_lang, sub)]
        """
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
            return [(sub_lang_list[0], None, None)]
        # Language preference: explicit --sub-lang, then English, then the
        # first available track.
        if self._downloader.params.get('subtitleslang', False):
            sub_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in sub_lang_list:
            sub_lang = list(sub_lang_list.keys())[0]
        if not sub_lang in sub_lang_list:
            return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]
        subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
    def _extract_all_subtitles(self, video_id):
        """Download every available subtitle track for *video_id*."""
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
            return [(sub_lang_list[0], None, None)]
        for sub_lang in sub_lang_list:
            subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
            subtitles.append(subtitle)
    def _print_formats(self, formats):
        """List each format's itag, container extension and dimensions."""
        print('Available formats:')
        # NOTE(review): x is presumably bound by a `for x in formats` loop
        # whose line is not visible in this excerpt.
        print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
    def _real_initialize(self):
        """Set the UI language, optionally log in, and confirm age."""
        if self._downloader is None:
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))

        # Force English so later page-scraping regexes match reliably.
        request = compat_urllib_request.Request(self._LANG_URL)
        compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))

        # No authentication to be performed
        request = compat_urllib_request.Request(self._LOGIN_URL)
        login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))

        # Google's login form requires the GALX and dsh hidden fields.
        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
        galx = match.group(1)
        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
                u'continue': u'http://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                u'PersistentCookie': u'yes',
                u'bgresponse': u'js_disabled',
                u'checkConnection': u'',
                u'checkedDomains': u'youtube',
                u'signIn': u'Sign in',
                u'service': u'youtube',
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # does not handle Unicode values correctly.
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        # If the login form is still present, authentication failed.
        if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
            self._downloader.report_warning(u'unable to log in: bad username or password')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))

            'action_confirm': 'Confirm',
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        self.report_age_confirmation()
        age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
    def _extract_id(self, url):
        """Extract the YouTube video id from *url* using _VALID_URL."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        # NOTE(review): the error call is presumably guarded by
        # `if mobj is None` in the full source.
        self._downloader.report_error(u'invalid URL: %s' % url)
        # Group 2 of _VALID_URL captures the bare video id.
        video_id = mobj.group(2)
    def _real_extract(self, url):
        """Download the watch page and video info, pick formats and build
        the result dictionaries."""
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
        video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        # Unescape the JS-escaped URL (\\/ -> /).
        player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))

        # Try several el= variants until one returns a token.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            video_info_webpage = self._download_webpage(video_info_url, video_id,
                    errnote='unable to download video info webpage')
            video_info = compat_parse_qs(video_info_webpage)
            if 'token' in video_info:
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.report_error(u'YouTube said: %s' % video_info['reason'][0])
                self._downloader.report_error(u'"token" parameter not in video info for unknown reason')

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.report_error(u'"rental" videos not supported')

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.report_error(u'unable to extract uploader name')
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        video_uploader_id = mobj.group(1)
        self._downloader.report_warning(u'unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            self._downloader.report_error(u'unable to extract video title')
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.report_warning(u'unable to extract video thumbnail')
        else: # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date: normalize the separators, then try several formats.
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
        format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
        for expression in format_expressions:
            upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
        # Fall back to the meta description tag.
        fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
        video_description = unescapeHTML(fd_mobj.group(1))
        video_description = u''

        # subtitles
        video_subtitles = None
        if self._downloader.params.get('writesubtitles', False):
            video_subtitles = self._extract_subtitle(video_id)
            (sub_error, sub_lang, sub) = video_subtitles[0]
            self._downloader.report_error(sub_error)
        if self._downloader.params.get('allsubtitles', False):
            video_subtitles = self._extract_all_subtitles(video_id)
            for video_subtitle in video_subtitles:
                (sub_error, sub_lang, sub) = video_subtitle
                self._downloader.report_error(sub_error)
        if self._downloader.params.get('listsubtitles', False):
            sub_lang_list = self._list_available_subtitles(video_id)

        if 'length_seconds' not in video_info:
            self._downloader.report_warning(u'unable to extract video duration')
        video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
            # Map itag -> playable URL (with the signature appended).
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
            format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                raise ExtractorError(u'no known formats available for video')
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    video_url_list = [(rf, url_map[rf])]
                if video_url_list is None:
                    raise ExtractorError(u'requested format not available')
            raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')

        # One result dictionary per selected format.
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

                'url':      video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date':  upload_date,
                'title':    video_title,
                'ext':      video_extension,
                'format':   video_format,
                'thumbnail':    video_thumbnail,
                'description':  video_description,
                'player_url':   player_url,
                'subtitles':    video_subtitles,
                'duration':     video_duration
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    # Fetched once so the family-filter cookie can be disabled.
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'
691 def report_disclaimer(self):
692 """Report disclaimer retrieval."""
693 self.to_screen(u'Retrieving disclaimer')
    def _real_initialize(self):
        """Accept Metacafe's disclaimer and disable the family filter."""
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        self.report_disclaimer()
        disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to retrieve disclaimer: %s' % compat_str(err))

            'submit': "Continue - I'm over 18",
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
        self.report_age_confirmation()
        disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
    def _real_extract(self, url):
        """Scrape the Metacafe watch page and return the result dicts."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        self._downloader.report_error(u'invalid URL: %s' % url)
        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            # Delegate yt-prefixed ids to the YouTube extractor.
            return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1), 'Youtube')]

        # Retrieve video webpage to extract further information
        webpage = self._download_webpage('http://www.metacafe.com/watch/%s/' % video_id, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))
        video_extension = mediaURL[-3:]

        # Extract gdaKey if available
        mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
        gdaKey = mobj.group(1)
        video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        # Fallback: parse the flashvars blob for mediaURL and key.
        mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
        self._downloader.report_error(u'unable to extract media URL')
        vardict = compat_parse_qs(mobj.group(1))
        if 'mediaData' not in vardict:
            self._downloader.report_error(u'unable to extract media URL')
        mobj = re.search(r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
        self._downloader.report_error(u'unable to extract media URL')
        mediaURL = mobj.group('mediaURL').replace('\\/', '/')
        video_extension = mediaURL[-3:]
        video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key'))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        self._downloader.report_error(u'unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'submitter=(.*?);', webpage)
        self._downloader.report_error(u'unable to extract uploader nickname')
        video_uploader = mobj.group(1)

            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    # (?i): Dailymotion serves many country TLDs (dailymotion.com/.fr/...).
    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'
    def _real_extract(self, url):
        """Scrape the Dailymotion page and return the result dicts."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        self._downloader.report_error(u'invalid URL: %s' % url)
        # Strip title slug and query string from the id segment.
        video_id = mobj.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        # Disable the family filter to reach age-restricted content.
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'\s*var flashvars = (.*)', webpage)
        self._downloader.report_error(u'unable to extract media URL')
        flashvars = compat_urllib_parse.unquote(mobj.group(1))

        # Pick the best quality present, in descending preference order.
        for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
            self.to_screen(u'Using %s' % key)
        self._downloader.report_error(u'unable to extract video URL')

        mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        self._downloader.report_error(u'unable to extract video URL')

        video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        self._downloader.report_error(u'unable to extract title')
        video_title = unescapeHTML(mobj.group('title'))

        video_uploader = None
        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        # lookin for official user
        mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
        if mobj_official is None:
            self._downloader.report_warning(u'unable to extract uploader nickname')
        video_uploader = mobj_official.group(1)
        video_uploader = mobj.group(1)

        video_upload_date = None
        mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        # Page shows DD-MM-YYYY; reassemble as YYYYMMDD.
        video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)

            'uploader': video_uploader,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    # Only flv media referenced via the ?current= parameter are handled.
    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'
    def _real_extract(self, url):
        """Scrape the Photobucket page and return the result dicts."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        self._downloader.report_error(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        self.report_download_webpage(video_id)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        self._downloader.report_error(u'unable to extract media URL')
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))

        # Title and uploader come from the page <title> element.
        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        self._downloader.report_error(u'unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        video_uploader = mobj.group(2).decode('utf-8')

            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
924 class YahooIE(InfoExtractor):
925 """Information extractor for video.yahoo.com."""
928 # _VALID_URL matches all Yahoo! Video URLs
929 # _VPAGE_URL matches only the extractable '/watch/' URLs
930 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
931 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
932 IE_NAME = u'video.yahoo'
934 def _real_extract(self, url, new_video=True):
935 # Extract ID from URL
936 mobj = re.match(self._VALID_URL, url)
938 self._downloader.report_error(u'Invalid URL: %s' % url)
941 video_id = mobj.group(2)
942 video_extension = 'flv'
944 # Rewrite valid but non-extractable URLs as
945 # extractable English language /watch/ URLs
946 if re.match(self._VPAGE_URL, url) is None:
947 request = compat_urllib_request.Request(url)
949 webpage = compat_urllib_request.urlopen(request).read()
950 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
951 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
954 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
956 self._downloader.report_error(u'Unable to extract id field')
958 yahoo_id = mobj.group(1)
960 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
962 self._downloader.report_error(u'Unable to extract vid field')
964 yahoo_vid = mobj.group(1)
966 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
967 return self._real_extract(url, new_video=False)
969 # Retrieve video webpage to extract further information
970 request = compat_urllib_request.Request(url)
972 self.report_download_webpage(video_id)
973 webpage = compat_urllib_request.urlopen(request).read()
974 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
975 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
978 # Extract uploader and title from webpage
979 self.report_extraction(video_id)
980 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
982 self._downloader.report_error(u'unable to extract video title')
984 video_title = mobj.group(1).decode('utf-8')
986 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
988 self._downloader.report_error(u'unable to extract video uploader')
990 video_uploader = mobj.group(1).decode('utf-8')
992 # Extract video thumbnail
993 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
995 self._downloader.report_error(u'unable to extract video thumbnail')
997 video_thumbnail = mobj.group(1).decode('utf-8')
999 # Extract video description
1000 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1002 self._downloader.report_error(u'unable to extract video description')
1004 video_description = mobj.group(1).decode('utf-8')
1005 if not video_description:
1006 video_description = 'No description available.'
1008 # Extract video height and width
1009 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1011 self._downloader.report_error(u'unable to extract video height')
1013 yv_video_height = mobj.group(1)
1015 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1017 self._downloader.report_error(u'unable to extract video width')
1019 yv_video_width = mobj.group(1)
1021 # Retrieve video playlist to extract media URL
1022 # I'm not completely sure what all these options are, but we
1023 # seem to need most of them, otherwise the server sends a 401.
1024 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
1025 yv_bitrate = '700' # according to Wikipedia this is hard-coded
1026 request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1027 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1028 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1030 self.report_download_webpage(video_id)
1031 webpage = compat_urllib_request.urlopen(request).read()
1032 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1033 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
1036 # Extract media URL from playlist XML
1037 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1039 self._downloader.report_error(u'Unable to extract media URL')
1041 video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1042 video_url = unescapeHTML(video_url)
1045 'id': video_id.decode('utf-8'),
1047 'uploader': video_uploader,
1048 'upload_date': None,
1049 'title': video_title,
1050 'ext': video_extension.decode('utf-8'),
1051 'thumbnail': video_thumbnail.decode('utf-8'),
1052 'description': video_description,
1056 class VimeoIE(InfoExtractor):
1057 """Information extractor for vimeo.com."""
# _VALID_URL accepts plain, www., player., group/album and
# play_redirect_hls Vimeo URLs; it captures the scheme in 'proto',
# the HLS-redirect marker in 'direct_link' and the numeric id in 'id'.
1059 # _VALID_URL matches Vimeo URLs
1060 _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
# Extract metadata and the final media URL for one Vimeo video.
# 'new_video' is kept for interface compatibility; it is not read below.
1063 def _real_extract(self, url, new_video=True):
1064 # Extract ID from URL
1065 mobj = re.match(self._VALID_URL, url)
# NOTE(review): the guard testing mobj for None appears between these
# lines in the full file — only the error-reporting call is visible here.
1067 self._downloader.report_error(u'Invalid URL: %s' % url)
1070 video_id = mobj.group('id')
# Normalise the URL: default to https, and rewrite direct HLS-redirect
# links to the canonical page so the config JSON can be scraped.
1071 if not mobj.group('proto'):
1072 url = 'https://' + url
1073 if mobj.group('direct_link'):
1074 url = 'https://vimeo.com/' + video_id
1076 # Retrieve video webpage to extract further information
1077 request = compat_urllib_request.Request(url, None, std_headers)
1079 self.report_download_webpage(video_id)
1080 webpage_bytes = compat_urllib_request.urlopen(request).read()
1081 webpage = webpage_bytes.decode('utf-8')
1082 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1083 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
1086 # Now we begin extracting as much information as we can from what we
1087 # retrieved. First we extract the information common to all extractors,
1088 # and latter we extract those that are Vimeo specific.
1089 self.report_extraction(video_id)
# The player configuration is embedded in the page as inline JSON;
# split on the literal markers rather than parsing the whole page.
1091 # Extract the config JSON
1093 config = webpage.split(' = {config:')[1].split(',assets:')[0]
1094 config = json.loads(config)
# Embedding restrictions produce a distinct error page; detect it to
# give the user an actionable hint (--referer) instead of a parse error.
1096 if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage):
1097 self._downloader.report_error(u'The author has restricted the access to this video, try with the "--referer" option')
1099 self._downloader.report_error(u'unable to extract info section')
1103 video_title = config["video"]["title"]
1105 # Extract uploader and uploader_id
# uploader_id is the last path component of the owner's profile URL.
1106 video_uploader = config["video"]["owner"]["name"]
1107 video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]
1109 # Extract video thumbnail
1110 video_thumbnail = config["video"]["thumbnail"]
1112 # Extract video description
1113 video_description = get_element_by_attribute("itemprop", "description", webpage)
1114 if video_description: video_description = clean_html(video_description)
1115 else: video_description = u''
1117 # Extract upload date
# Converted from ISO "YYYY-MM-DDT..." to the YYYYMMDD form expected
# by the 'upload_date' field (see the InfoExtractor class docstring).
1118 video_upload_date = None
1119 mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
1120 if mobj is not None:
1121 video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)
1123 # Vimeo specific: extract request signature and timestamp
# Both values must be echoed back in the play_redirect request below.
1124 sig = config['request']['signature']
1125 timestamp = config['request']['timestamp']
1127 # Vimeo specific: extract video codec and quality information
1128 # First consider quality, then codecs, then take everything
1129 # TODO bind to format param
# Codec preference order: h264/mp4 first, then vp8 and vp6 (flv).
1130 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1131 files = { 'hd': [], 'sd': [], 'other': []}
# Bucket each available codec by quality; 'other' falls back to the
# first quality string the config advertises for that codec.
1132 for codec_name, codec_extension in codecs:
1133 if codec_name in config["video"]["files"]:
1134 if 'hd' in config["video"]["files"][codec_name]:
1135 files['hd'].append((codec_name, codec_extension, 'hd'))
1136 elif 'sd' in config["video"]["files"][codec_name]:
1137 files['sd'].append((codec_name, codec_extension, 'sd'))
1139 files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))
# Pick the best bucket that is non-empty, preferring hd > sd > other.
1141 for quality in ('hd', 'sd', 'other'):
1142 if len(files[quality]) > 0:
1143 video_quality = files[quality][0][2]
1144 video_codec = files[quality][0][0]
1145 video_extension = files[quality][0][1]
1146 self.to_screen(u'%s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
1149 self._downloader.report_error(u'no known codec found')
# Build the redirecting play URL carrying the signature/timestamp.
1152 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1153 %(video_id, sig, timestamp, video_quality, video_codec.upper())
# Result dictionary; fields follow the InfoExtractor contract above.
1158 'uploader': video_uploader,
1159 'uploader_id': video_uploader_id,
1160 'upload_date': video_upload_date,
1161 'title': video_title,
1162 'ext': video_extension,
1163 'thumbnail': video_thumbnail,
1164 'description': video_description,
1168 class ArteTvIE(InfoExtractor):
1169 """arte.tv information extractor."""
# Only the French/German video listings on videos.arte.tv are handled;
# _LIVE_URL distinguishes live-stream index pages from plus7 videos.
1171 _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
1172 _LIVE_URL = r'index-[0-9]+\.html$'
1174 IE_NAME = u'arte.tv'
# Download a page and return its raw body, reporting network errors
# through the downloader.
1176 def fetch_webpage(self, url):
1177 request = compat_urllib_request.Request(url)
1179 self.report_download_webpage(url)
1180 webpage = compat_urllib_request.urlopen(request).read()
1181 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1182 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
1184 except ValueError as err:
1185 self._downloader.report_error(u'Invalid URL: %s' % url)
# Fetch 'url', apply 'regex' with 'regexFlags', and map capture groups
# to dictionary keys. matchTuples is a list of (group_index, key,
# error_message); each missing group reports its own error.
1189 def grep_webpage(self, url, regex, regexFlags, matchTuples):
1190 page = self.fetch_webpage(url)
1191 mobj = re.search(regex, page, regexFlags)
1195 self._downloader.report_error(u'Invalid URL: %s' % url)
1198 for (i, key, err) in matchTuples:
1199 if mobj.group(i) is None:
1200 self._downloader.report_error(err)
1203 info[key] = mobj.group(i)
# Resolve a live-stream page: find the videothek JS loader, then pull
# the rtmp path/player/url triple out of it and join path onto url.
# The language code is the 4th path component from the end of the URL.
1207 def extractLiveStream(self, url):
1208 video_lang = url.split('/')[-4]
1209 info = self.grep_webpage(
1211 r'src="(.*?/videothek_js.*?\.js)',
1214 (1, 'url', u'Invalid URL: %s' % url)
1217 http_host = url.split('/')[2]
1218 next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
1219 info = self.grep_webpage(
1221 r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
1222 '(http://.*?\.swf).*?' +
1226 (1, 'path', u'could not extract video path: %s' % url),
1227 (2, 'player', u'could not extract video player: %s' % url),
1228 (3, 'url', u'could not extract video url: %s' % url)
1231 video_url = u'%s/%s' % (info.get('url'), info.get('path'))
# Resolve an arte+7 (catch-up) video: follow two levels of XML
# indirection (videoref file, then per-language <video> ref) before
# reading id/title/date and the hd-quality URL from the final XML.
1233 def extractPlus7Stream(self, url):
1234 video_lang = url.split('/')[-3]
1235 info = self.grep_webpage(
1237 r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
1240 (1, 'url', u'Invalid URL: %s' % url)
1243 next_url = compat_urllib_parse.unquote(info.get('url'))
1244 info = self.grep_webpage(
1246 r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
1249 (1, 'url', u'Could not find <video> tag: %s' % url)
1252 next_url = compat_urllib_parse.unquote(info.get('url'))
1254 info = self.grep_webpage(
1256 r'<video id="(.*?)".*?>.*?' +
1257 '<name>(.*?)</name>.*?' +
1258 '<dateVideo>(.*?)</dateVideo>.*?' +
1259 '<url quality="hd">(.*?)</url>',
1262 (1, 'id', u'could not extract video id: %s' % url),
1263 (2, 'title', u'could not extract video title: %s' % url),
1264 (3, 'date', u'could not extract video date: %s' % url),
1265 (4, 'url', u'could not extract video url: %s' % url)
# Result dictionary assembled from the final grep_webpage pass.
1270 'id': info.get('id'),
1271 'url': compat_urllib_parse.unquote(info.get('url')),
1272 'uploader': u'arte.tv',
1273 'upload_date': info.get('date'),
1274 'title': info.get('title').decode('utf-8'),
# Dispatch: live index pages go to extractLiveStream, everything else
# is treated as a plus7 (catch-up) video.
1280 def _real_extract(self, url):
1281 video_id = url.split('/')[-1]
1282 self.report_extraction(video_id)
1284 if re.search(self._LIVE_URL, video_id) is not None:
1285 self.extractLiveStream(url)
1288 info = self.extractPlus7Stream(url)
1293 class GenericIE(InfoExtractor):
1294 """Generic last-resort information extractor."""
1297 IE_NAME = u'generic'
# Warn that the generic fallback is being used (unless running tests),
# then delegate to the base-class progress report.
1299 def report_download_webpage(self, video_id):
1300 """Report webpage download."""
1301 if not self._downloader.params.get('test', False):
1302 self._downloader.report_warning(u'Falling back on generic information extractor.')
1303 super(GenericIE, self).report_download_webpage(video_id)
1305 def report_following_redirect(self, new_url):
1306 """Report information extraction."""
1307 self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
# Probe the URL with a HEAD request to resolve shorteners/redirects;
# returns the final URL when it differs from the input.
1309 def _test_redirect(self, url):
1310 """Check if it is a redirect, like url shorteners, in case return the new url."""
# Request subclass whose method is HEAD instead of GET.
1311 class HeadRequest(compat_urllib_request.Request):
1312 def get_method(self):
1315 class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
1317 Subclass the HTTPRedirectHandler to make it use our
1318 HeadRequest also on the redirected URL
# Re-issue the redirect target as a HEAD request, dropping the
# body-describing headers that no longer apply.
1320 def redirect_request(self, req, fp, code, msg, headers, newurl):
1321 if code in (301, 302, 303, 307):
1322 newurl = newurl.replace(' ', '%20')
1323 newheaders = dict((k,v) for k,v in req.headers.items()
1324 if k.lower() not in ("content-length", "content-type"))
1325 return HeadRequest(newurl,
1327 origin_req_host=req.get_origin_req_host(),
1330 raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)
1332 class HTTPMethodFallback(compat_urllib_request.BaseHandler):
1334 Fallback to GET if HEAD is not allowed (405 HTTP error)
1336 def http_error_405(self, req, fp, code, msg, headers):
1340 newheaders = dict((k,v) for k,v in req.headers.items()
1341 if k.lower() not in ("content-length", "content-type"))
1342 return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
1344 origin_req_host=req.get_origin_req_host(),
# Build a minimal opener with just the handlers needed for the
# HEAD probe (plain HTTP/HTTPS plus the two custom handlers above).
1348 opener = compat_urllib_request.OpenerDirector()
1349 for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
1350 HTTPMethodFallback, HEADRedirectHandler,
1351 compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
1352 opener.add_handler(handler())
1354 response = opener.open(HeadRequest(url))
1355 new_url = response.geturl()
1360 self.report_following_redirect(new_url)
# Last-resort extraction: follow redirects, then scrape the page for a
# direct media URL using a sequence of progressively looser regexes.
1363 def _real_extract(self, url):
1364 new_url = self._test_redirect(url)
1365 if new_url: return [self.url_result(new_url)]
1367 video_id = url.split('/')[-1]
1369 webpage = self._download_webpage(url, video_id)
1370 except ValueError as err:
1371 # since this is the last-resort InfoExtractor, if
1372 # this error is thrown, it'll be thrown here
1373 self._downloader.report_error(u'Invalid URL: %s' % url)
1376 self.report_extraction(video_id)
1377 # Start with something easy: JW Player in SWFObject
1378 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1380 # Broaden the search a little bit
1381 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1383 # Broaden the search a little bit: JWPlayer JS loader
1384 mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
1386 self._downloader.report_error(u'Invalid URL: %s' % url)
1389 # It's possible that one of the regexes
1390 # matched, but returned an empty group:
1391 if mobj.group(1) is None:
1392 self._downloader.report_error(u'Invalid URL: %s' % url)
1395 video_url = compat_urllib_parse.unquote(mobj.group(1))
1396 video_id = os.path.basename(video_url)
# Derive id/extension from the media file name in the matched URL.
1398 # here's a fun little line of code for you:
1399 video_extension = os.path.splitext(video_id)[1][1:]
1400 video_id = os.path.splitext(video_id)[0]
1402 # it's tempting to parse this further, but you would
1403 # have to take into account all the variations like
1404 # Video Title - Site Name
1405 # Site Name | Video Title
1406 # Video Title - Tagline | Site Name
1407 # and so on and so forth; it's just not practical
1408 mobj = re.search(r'<title>(.*)</title>', webpage)
1410 self._downloader.report_error(u'unable to extract title')
1412 video_title = mobj.group(1)
1414 # video uploader is domain name
1415 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
# NOTE(review): this error message says 'title' but reports a failure
# to extract the uploader (domain name) — likely a copy/paste slip.
1417 self._downloader.report_error(u'unable to extract title')
1419 video_uploader = mobj.group(1)
# Result dictionary for the scraped direct media URL.
1424 'uploader': video_uploader,
1425 'upload_date': None,
1426 'title': video_title,
1427 'ext': video_extension,
1431 class YoutubeSearchIE(InfoExtractor):
1432 """Information Extractor for YouTube search queries."""
# Handles 'ytsearch:', 'ytsearchN:' and 'ytsearchall:' pseudo-URLs via
# the GData JSON API (50 results per page, capped at 1000 overall).
1433 _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
1434 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1435 _max_youtube_results = 1000
1436 IE_NAME = u'youtube:search'
1438 def report_download_page(self, query, pagenum):
1439 """Report attempt to download search page with given number."""
1440 query = query.decode(preferredencoding())
1441 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
# Parse the prefix (empty / 'all' / integer N) and fetch that many
# results; integer parsing failures fall back to a single result.
1443 def _real_extract(self, query):
1444 mobj = re.match(self._VALID_URL, query)
1446 self._downloader.report_error(u'invalid search query "%s"' % query)
1449 prefix, query = query.split(':')
1451 query = query.encode('utf-8')
1453 return self._get_n_results(query, 1)
1454 elif prefix == 'all':
1455 self._get_n_results(query, self._max_youtube_results)
1460 self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
1462 elif n > self._max_youtube_results:
1463 self._downloader.report_warning(u'ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1464 n = self._max_youtube_results
1465 return self._get_n_results(query, n)
1466 except ValueError: # parsing prefix as integer fails
1467 return self._get_n_results(query, 1)
# Page through the API 50 results at a time until n ids are collected
# (or the API reports fewer total items), then wrap them as url results.
1469 def _get_n_results(self, query, n):
1470 """Get a specified number of results for a query"""
1476 while (50 * pagenum) < limit:
1477 self.report_download_page(query, pagenum+1)
1478 result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
1479 request = compat_urllib_request.Request(result_url)
1481 data = compat_urllib_request.urlopen(request).read().decode('utf-8')
1482 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1483 self._downloader.report_error(u'unable to download API page: %s' % compat_str(err))
1485 api_response = json.loads(data)['data']
1487 if not 'items' in api_response:
1488 self._downloader.report_error(u'[youtube] No video results')
1491 new_ids = list(video['id'] for video in api_response['items'])
1492 video_ids += new_ids
# Never ask for more than the API says exists.
1494 limit = min(n, api_response['totalItems'])
1497 if len(video_ids) > n:
1498 video_ids = video_ids[:n]
1499 videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids]
1503 class GoogleSearchIE(InfoExtractor):
1504 """Information Extractor for Google Video search queries."""
# Handles 'gvsearch:' pseudo-URLs by scraping the HTML result pages
# (no API): _VIDEO_INDICATOR finds docids, _MORE_PAGES_INDICATOR the
# "next page" link.
1505 _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
1506 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1507 _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
1508 _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
1509 _max_google_results = 1000
1510 IE_NAME = u'video.google:search'
1512 def report_download_page(self, query, pagenum):
1513 """Report attempt to download playlist page with given number."""
1514 query = query.decode(preferredencoding())
1515 self.to_screen(u'query "%s": Downloading page %s' % (query, pagenum))
# Parse the prefix (empty / 'all' / integer N) and download that many
# results; mirrors YoutubeSearchIE._real_extract.
1517 def _real_extract(self, query):
1518 mobj = re.match(self._VALID_URL, query)
1520 self._downloader.report_error(u'invalid search query "%s"' % query)
1523 prefix, query = query.split(':')
1525 query = query.encode('utf-8')
1527 self._download_n_results(query, 1)
1529 elif prefix == 'all':
1530 self._download_n_results(query, self._max_google_results)
1536 self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
1538 elif n > self._max_google_results:
1539 self._downloader.report_warning(u'gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1540 n = self._max_google_results
1541 self._download_n_results(query, n)
1543 except ValueError: # parsing prefix as integer fails
1544 self._download_n_results(query, 1)
# Scrape result pages, collecting unique docids until n are found or
# no "next page" link remains, then hand each video to the downloader.
1547 def _download_n_results(self, query, n):
1548 """Downloads a specified number of results for a query"""
1554 self.report_download_page(query, pagenum)
1555 result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
1556 request = compat_urllib_request.Request(result_url)
1558 page = compat_urllib_request.urlopen(request).read()
1559 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1560 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
1563 # Extract video identifiers
1564 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1565 video_id = mobj.group(1)
1566 if video_id not in video_ids:
1567 video_ids.append(video_id)
1568 if len(video_ids) == n:
1569 # Specified n videos reached
1570 for id in video_ids:
1571 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
# No further pages: flush whatever was collected.
1574 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1575 for id in video_ids:
1576 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1579 pagenum = pagenum + 1
1582 class YahooSearchIE(InfoExtractor):
1583 """Information Extractor for Yahoo! Video search queries."""
# Handles 'yvsearch:' pseudo-URLs by scraping HTML result pages;
# structure parallels GoogleSearchIE above.
1586 _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
1587 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1588 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1589 _MORE_PAGES_INDICATOR = r'\s*Next'
1590 _max_yahoo_results = 1000
1591 IE_NAME = u'video.yahoo:search'
1593 def report_download_page(self, query, pagenum):
1594 """Report attempt to download playlist page with given number."""
1595 query = query.decode(preferredencoding())
1596 self.to_screen(u'query "%s": Downloading page %s' % (query, pagenum))
# Parse the prefix (empty / 'all' / integer N) and download that many
# results; mirrors the other search extractors.
1598 def _real_extract(self, query):
1599 mobj = re.match(self._VALID_URL, query)
1601 self._downloader.report_error(u'invalid search query "%s"' % query)
1604 prefix, query = query.split(':')
1606 query = query.encode('utf-8')
1608 self._download_n_results(query, 1)
1610 elif prefix == 'all':
1611 self._download_n_results(query, self._max_yahoo_results)
1617 self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
1619 elif n > self._max_yahoo_results:
1620 self._downloader.report_warning(u'yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1621 n = self._max_yahoo_results
1622 self._download_n_results(query, n)
1624 except ValueError: # parsing prefix as integer fails
1625 self._download_n_results(query, 1)
# Scrape result pages, de-duplicating via 'already_seen', until n ids
# are collected or no "Next" link remains; then queue each watch URL.
1628 def _download_n_results(self, query, n):
1629 """Downloads a specified number of results for a query"""
1632 already_seen = set()
1636 self.report_download_page(query, pagenum)
1637 result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
1638 request = compat_urllib_request.Request(result_url)
1640 page = compat_urllib_request.urlopen(request).read()
1641 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1642 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
1645 # Extract video identifiers
1646 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1647 video_id = mobj.group(1)
1648 if video_id not in already_seen:
1649 video_ids.append(video_id)
1650 already_seen.add(video_id)
1651 if len(video_ids) == n:
1652 # Specified n videos reached
1653 for id in video_ids:
1654 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
# No further pages: flush whatever was collected.
1657 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1658 for id in video_ids:
1659 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1662 pagenum = pagenum + 1
1665 class YoutubePlaylistIE(InfoExtractor):
1666 """Information Extractor for YouTube playlists."""
# Verbose regex accepting playlist/course/artist/watch URLs with
# p=/a=/list= query parameters as well as bare PL/EC/UU playlist ids.
1668 _VALID_URL = r"""(?:
1673 (?:course|view_play_list|my_playlists|artist|playlist|watch)
1674 \? (?:.*?&)*? (?:p|a|list)=
1677 ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
1680 ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
1682 _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
1684 IE_NAME = u'youtube:playlist'
# Overridden because _VALID_URL is a verbose-mode pattern and needs
# the re.VERBOSE flag that the base-class suitable() does not pass.
1687 def suitable(cls, url):
1688 """Receives a URL and returns True if suitable for this IE."""
1689 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
1691 def report_download_page(self, playlist_id, pagenum):
1692 """Report attempt to download playlist page with given number."""
1693 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
# Page through the GData playlist feed collecting (position, watch-url)
# pairs, sort by position, and return a single playlist result.
1695 def _real_extract(self, url):
1696 # Extract playlist id
1697 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
1699 self._downloader.report_error(u'invalid url: %s' % url)
1702 # Download playlist videos from API
# The id comes from whichever alternative of the pattern matched.
1703 playlist_id = mobj.group(1) or mobj.group(2)
1708 self.report_download_page(playlist_id, page_num)
1710 url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
1712 page = compat_urllib_request.urlopen(url).read().decode('utf8')
1713 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1714 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
1718 response = json.loads(page)
1719 except ValueError as err:
1720 self._downloader.report_error(u'Invalid JSON in API response: ' + compat_str(err))
1723 if 'feed' not in response:
1724 self._downloader.report_error(u'Got a malformed response from YouTube API')
# A feed without 'entry' means the playlist length was an exact
# multiple of _MAX_RESULTS and this page is past the end.
1726 if 'entry' not in response['feed']:
1727 # Number of videos is a multiple of self._MAX_RESULTS
1730 playlist_title = response['feed']['title']['$t']
# Keep the playlist position so the final list can be order-sorted;
# entries without 'content' (e.g. deleted videos) are skipped.
1732 videos += [ (entry['yt$position']['$t'], entry['content']['src'])
1733 for entry in response['feed']['entry']
1734 if 'content' in entry ]
1736 if len(response['feed']['entry']) < self._MAX_RESULTS:
1740 videos = [v[1] for v in sorted(videos)]
1742 url_results = [self.url_result(url, 'Youtube') for url in videos]
1743 return [self.playlist_result(url_results, playlist_id, playlist_title)]
1746 class YoutubeChannelIE(InfoExtractor):
1747 """Information Extractor for YouTube channels."""
# First page comes from the regular channel HTML; subsequent pages use
# the channel_ajax JSON endpoint. _MORE_PAGES_INDICATOR is a CSS class
# that marks the presence of a "load more" control.
1749 _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
1750 _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
1751 _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
1752 _MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
1753 IE_NAME = u'youtube:channel'
1755 def report_download_page(self, channel_id, pagenum):
1756 """Report attempt to download channel page with given number."""
1757 self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))
# Collect unique video ids from the /watch?v= links in an HTML page
# (or an HTML fragment from the ajax endpoint), preserving order.
1759 def extract_videos_from_page(self, page):
1761 for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
1762 if mobj.group(1) not in ids_in_page:
1763 ids_in_page.append(mobj.group(1))
# Walk the channel's video listing page by page and return one
# playlist result containing a watch URL per video found.
1766 def _real_extract(self, url):
1767 # Extract channel id
1768 mobj = re.match(self._VALID_URL, url)
1770 self._downloader.report_error(u'invalid url: %s' % url)
1773 # Download channel page
1774 channel_id = mobj.group(1)
1778 self.report_download_page(channel_id, pagenum)
1779 url = self._TEMPLATE_URL % (channel_id, pagenum)
1780 request = compat_urllib_request.Request(url)
1782 page = compat_urllib_request.urlopen(request).read().decode('utf8')
1783 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1784 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
1787 # Extract video identifiers
1788 ids_in_page = self.extract_videos_from_page(page)
1789 video_ids.extend(ids_in_page)
1791 # Download any subsequent channel pages using the json-based channel_ajax query
1792 if self._MORE_PAGES_INDICATOR in page:
1794 pagenum = pagenum + 1
1796 self.report_download_page(channel_id, pagenum)
1797 url = self._MORE_PAGES_URL % (pagenum, channel_id)
1798 request = compat_urllib_request.Request(url)
1800 page = compat_urllib_request.urlopen(request).read().decode('utf8')
1801 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1802 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
# The ajax endpoint returns JSON with the listing HTML embedded in
# 'content_html' and the pager markup in 'load_more_widget_html'.
1805 page = json.loads(page)
1807 ids_in_page = self.extract_videos_from_page(page['content_html'])
1808 video_ids.extend(ids_in_page)
1810 if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
1813 self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))
1815 urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids]
1816 url_entries = [self.url_result(url, 'Youtube') for url in urls]
1817 return [self.playlist_result(url_entries, channel_id)]
1820 class YoutubeUserIE(InfoExtractor):
1821 """Information Extractor for YouTube users."""
# Accepts youtube.com/user/NAME URLs or the 'ytuser:NAME' shorthand;
# uploads are fetched through the GData API, 50 ids per request.
1823 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1824 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1825 _GDATA_PAGE_SIZE = 50
1826 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
1827 _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
1828 IE_NAME = u'youtube:user'
1830 def report_download_page(self, username, start_index):
1831 """Report attempt to download user page."""
1832 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
1833 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
# Page through the user's uploads feed collecting video ids, and
# return one playlist result titled with the username.
1835 def _real_extract(self, url):
1837 mobj = re.match(self._VALID_URL, url)
1839 self._downloader.report_error(u'invalid url: %s' % url)
1842 username = mobj.group(1)
1844 # Download video ids using YouTube Data API. Result size per
1845 # query is limited (currently to 50 videos) so we need to query
1846 # page by page until there are no video ids - it means we got
# GData start-index is 1-based.
1853 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1854 self.report_download_page(username, start_index)
1856 request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
1859 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1860 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1861 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
1864 # Extract video identifiers
1867 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1868 if mobj.group(1) not in ids_in_page:
1869 ids_in_page.append(mobj.group(1))
1871 video_ids.extend(ids_in_page)
1873 # A little optimization - if current page is not
1874 # "full", ie. does not contain PAGE_SIZE video ids then
1875 # we can assume that this page is the last one - there
1876 # are no more ids on further pages - no need to query
1879 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
1884 urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
1885 url_results = [self.url_result(url, 'Youtube') for url in urls]
1886 return [self.playlist_result(url_results, playlist_title = username)]
1889 class BlipTVUserIE(InfoExtractor):
1890 """Information Extractor for blip.tv users."""
# Accepts blip.tv/USERNAME URLs (no trailing path) or the
# 'bliptvuser:USERNAME' shorthand.
1892 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
1894 IE_NAME = u'blip.tv:user'
1896 def report_download_page(self, username, pagenum):
1897 """Report attempt to download user page."""
1898 self.to_screen(u'user %s: Downloading video ids from page %d' %
1899 (username, pagenum))
# Resolve the user page to a numeric users_id, then page through the
# mobile episode-list endpoint collecting video paths.
1901 def _real_extract(self, url):
1903 mobj = re.match(self._VALID_URL, url)
1905 self._downloader.report_error(u'invalid url: %s' % url)
1908 username = mobj.group(1)
# The episode-list endpoint needs the numeric users_id, which is
# scraped from the data-users-id attribute of the profile page.
1910 page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
1912 request = compat_urllib_request.Request(url)
1915 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1916 mobj = re.search(r'data-users-id="([^"]+)"', page)
1917 page_base = page_base % mobj.group(1)
1918 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1919 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
1923 # Download video ids using BlipTV Ajax calls. Result size per
1924 # query is limited (currently to 12 videos) so we need to query
1925 # page by page until there are no video ids - it means we got
1932 self.report_download_page(username, pagenum)
1933 url = page_base + "&page=" + str(pagenum)
1934 request = compat_urllib_request.Request( url )
1936 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1937 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
# NOTE(review): uses str(err) here where sibling extractors use
# compat_str(err) — inconsistent but visible behavior is the same
# message formatting on Python 3.
1938 self._downloader.report_error(u'unable to download webpage: %s' % str(err))
1941 # Extract video identifiers
1944 for mobj in re.finditer(r'href="/([^"]+)"', page):
1945 if mobj.group(1) not in ids_in_page:
1946 ids_in_page.append(unescapeHTML(mobj.group(1)))
1948 video_ids.extend(ids_in_page)
1950 # A little optimization - if current page is not
1951 # "full", ie. does not contain PAGE_SIZE video ids then
1952 # we can assume that this page is the last one - there
1953 # are no more ids on further pages - no need to query
1956 if len(ids_in_page) < self._PAGE_SIZE:
1961 urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
1962 url_entries = [self.url_result(url, 'BlipTV') for url in urls]
1963 return [self.playlist_result(url_entries, playlist_title = username)]
1966 class DepositFilesIE(InfoExtractor):
1967 """Information extractor for depositfiles.com"""
# Any locale path segment between the host and /files/ is accepted;
# the URL is rebuilt with the English locale before fetching.
1969 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
# Extract the direct download URL for a hosted file (not a video):
# POST the 'Free download' form, then scrape the fileshare form action.
1971 def _real_extract(self, url):
1972 file_id = url.split('/')[-1]
1973 # Rebuild url in english locale
1974 url = 'http://depositfiles.com/en/files/' + file_id
1976 # Retrieve file webpage with 'Free download' button pressed
# Sending this form field simulates pressing the free-download button.
1977 free_download_indication = { 'gateway_result' : '1' }
1978 request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
1980 self.report_download_webpage(file_id)
1981 webpage = compat_urllib_request.urlopen(request).read()
1982 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1983 self._downloader.report_error(u'Unable to retrieve file webpage: %s' % compat_str(err))
1986 # Search for the real file URL
1987 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
1988 if (mobj is None) or (mobj.group(1) is None):
1989 # Try to figure out reason of the error.
# The site explains refusals (e.g. download limits) in a <strong>
# block starting with 'Attention'; surface that text to the user.
1990 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
1991 if (mobj is not None) and (mobj.group(1) is not None):
1992 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
1993 self._downloader.report_error(u'%s' % restriction_message)
1995 self._downloader.report_error(u'unable to extract download URL from: %s' % url)
1998 file_url = mobj.group(1)
1999 file_extension = os.path.splitext(file_url)[1][1:]
2001 # Search for file title
2002 mobj = re.search(r'<b title="(.*?)">', webpage)
2004 self._downloader.report_error(u'unable to extract title')
2006 file_title = mobj.group(1).decode('utf-8')
# Result dictionary; 'title'/'ext' fields follow the InfoExtractor
# contract even though the payload is a file rather than a video.
2009 'id': file_id.decode('utf-8'),
2010 'url': file_url.decode('utf-8'),
2012 'upload_date': None,
2013 'title': file_title,
2014 'ext': file_extension.decode('utf-8'),
# NOTE(review): embedded source-line numbers are non-contiguous — 'try:', 'return'
# and 'if ... is None:' guard statements are elided from this excerpt.
2018 class FacebookIE(InfoExtractor):
2019 """Information Extractor for Facebook"""
2021 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2022 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
# Machine name used to look up credentials in the user's ~/.netrc file.
2023 _NETRC_MACHINE = 'facebook'
2024 IE_NAME = u'facebook'
2026 def report_login(self):
2027 """Report attempt to log in."""
2028 self.to_screen(u'Logging in')
# Optional login: attempts authentication before extraction; failures only warn.
2030 def _real_initialize(self):
2031 if self._downloader is None:
2036 downloader_params = self._downloader.params
2038 # Attempt to use provided username and password or .netrc data
2039 if downloader_params.get('username', None) is not None:
2040 useremail = downloader_params['username']
2041 password = downloader_params['password']
2042 elif downloader_params.get('usenetrc', False):
2044 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2045 if info is not None:
2049 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2050 except (IOError, netrc.NetrcParseError) as err:
# .netrc problems are non-fatal: warn and continue unauthenticated.
2051 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
2054 if useremail is None:
2063 request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
2066 login_results = compat_urllib_request.urlopen(request).read()
# The login <form> reappearing in the response means authentication failed.
2067 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2068 self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2070 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2071 self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
2074 def _real_extract(self, url):
2075 mobj = re.match(self._VALID_URL, url)
2077 self._downloader.report_error(u'invalid URL: %s' % url)
2079 video_id = mobj.group('ID')
# Canonicalize to the desktop video URL before downloading the page.
2081 url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
2082 webpage = self._download_webpage(url, video_id)
2084 BEFORE = '{swf.addParam(param[0], param[1]);});\n'
2085 AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
# The JSON payload sits between two literal JS fragments in the page source;
# re.escape makes the anchors match verbatim.
2086 m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
2088 raise ExtractorError(u'Cannot parse data')
2089 data = dict(json.loads(m.group(1)))
# 'params' is URL-quoted JSON inside the outer JSON object.
2090 params_raw = compat_urllib_parse.unquote(data['params'])
2091 params = json.loads(params_raw)
2092 video_data = params['video_data'][0]
# Prefer the HD stream; fall back to SD (fallback line elided between 2093/2095).
2093 video_url = video_data.get('hd_src')
2095 video_url = video_data['sd_src']
2097 raise ExtractorError(u'Cannot find video URL')
2098 video_duration = int(video_data['video_duration'])
2099 thumbnail = video_data['thumbnail_src']
2101 m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
2103 raise ExtractorError(u'Cannot find title in webpage')
2104 video_title = unescapeHTML(m.group(1))
2108 'title': video_title,
2111 'duration': video_duration,
2112 'thumbnail': thumbnail,
# NOTE(review): embedded source-line numbers are non-contiguous — 'try:', 'return'
# and guard statements are elided from this excerpt.
2117 class BlipTVIE(InfoExtractor):
2118 """Information extractor for blip.tv"""
2120 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
# Used to derive the file extension from the final media URL.
2121 _URL_EXT = r'^.*\.([a-z0-9]+)$'
2122 IE_NAME = u'blip.tv'
2124 def report_direct_download(self, title):
2125 """Report information extraction."""
2126 self.to_screen(u'%s: Direct download detected' % title)
2128 def _real_extract(self, url):
2129 mobj = re.match(self._VALID_URL, url)
2131 self._downloader.report_error(u'invalid URL: %s' % url)
# /play/ URLs redirect to a page whose fragment carries the real file id;
# rewrite to the canonical /a/a-<id> URL and recurse once.
2134 urlp = compat_urllib_parse_urlparse(url)
2135 if urlp.path.startswith('/play/'):
2136 request = compat_urllib_request.Request(url)
2137 response = compat_urllib_request.urlopen(request)
2138 redirecturl = response.geturl()
2139 rurlp = compat_urllib_parse_urlparse(redirecturl)
2140 file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
2141 url = 'http://blip.tv/a/a-' + file_id
2142 return self._real_extract(url)
# Ask the site for JSON metadata instead of scraping HTML.
2149 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2150 request = compat_urllib_request.Request(json_url)
# The iTunes User-Agent is required to get the expected response; it is also
# forwarded to the downloader below.
2151 request.add_header('User-Agent', 'iTunes/10.6.1')
2152 self.report_extraction(mobj.group(1))
2155 urlh = compat_urllib_request.urlopen(request)
2156 if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
# The "JSON" URL served the media itself: derive title/ext from the URL basename.
2157 basename = url.split('/')[-1]
2158 title,ext = os.path.splitext(basename)
# NOTE(review): .decode('UTF-8') on a str is Python 2 idiom; would fail on py3.
2159 title = title.decode('UTF-8')
2160 ext = ext.replace('.', '')
2161 self.report_direct_download(title)
2166 'upload_date': None,
2171 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2172 raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
2173 if info is None: # Regular URL
2175 json_code_bytes = urlh.read()
2176 json_code = json_code_bytes.decode('utf-8')
2177 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2178 self._downloader.report_error(u'unable to read video info webpage: %s' % compat_str(err))
2182 json_data = json.loads(json_code)
# Responses sometimes nest the payload under a 'Post' key.
2183 if 'Post' in json_data:
2184 data = json_data['Post']
# Site datestamp format is m-d-yy H:M(am|pm); normalized to YYYYMMDD.
2188 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2189 video_url = data['media']['url']
2190 umobj = re.match(self._URL_EXT, video_url)
2192 raise ValueError('Can not determine filename extension')
2193 ext = umobj.group(1)
2196 'id': data['item_id'],
2198 'uploader': data['display_name'],
2199 'upload_date': upload_date,
2200 'title': data['title'],
2202 'format': data['media']['mimeType'],
2203 'thumbnail': data['thumbnailUrl'],
2204 'description': data['description'],
2205 'player_url': data['embedUrl'],
2206 'user_agent': 'iTunes/10.6.1',
# Missing/odd JSON fields surface here as ValueError/KeyError.
2208 except (ValueError,KeyError) as err:
2209 self._downloader.report_error(u'unable to parse video information: %s' % repr(err))
# NOTE(review): embedded source-line numbers are non-contiguous — guard and
# 'return' statements are elided from this excerpt.
2215 class MyVideoIE(InfoExtractor):
2216 """Information Extractor for myvideo.de."""
2218 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2219 IE_NAME = u'myvideo'
2221 def _real_extract(self,url):
2222 mobj = re.match(self._VALID_URL, url)
# NOTE(review): 'self._download' looks like a typo for 'self._downloader'
# (every sibling extractor uses self._downloader); this line would raise
# AttributeError when an invalid URL is passed — confirm and fix upstream.
2224 self._download.report_error(u'invalid URL: %s' % url)
2227 video_id = mobj.group(1)
2230 webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
2231 webpage = self._download_webpage(webpage_url, video_id)
2233 self.report_extraction(video_id)
# The thumbnail link's base path doubles as the media URL prefix.
2234 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\'',
2237 self._downloader.report_error(u'unable to extract media URL')
# Media URL = thumbnail base path + /<video_id>.flv
2239 video_url = mobj.group(1) + ('/%s.flv' % video_id)
2241 mobj = re.search('<title>([^<]+)</title>', webpage)
2243 self._downloader.report_error(u'unable to extract title')
2246 video_title = mobj.group(1)
2252 'upload_date': None,
2253 'title': video_title,
# NOTE(review): embedded source-line numbers are non-contiguous — 'try:',
# 'return', 'else:' and guard statements are elided from this excerpt.
2257 class ComedyCentralIE(InfoExtractor):
2258 """Information extractor for The Daily Show and Colbert Report """
2260 # urls can be abbreviations like :thedailyshow or :colbert
2261 # urls for episodes like:
2262 # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
2263 # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
2264 # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
# Verbose regex — must always be matched with re.VERBOSE (see suitable()).
2265 _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
2266 |(https?://)?(www\.)?
2267 (?P<showname>thedailyshow|colbertnation)\.com/
2268 (full-episodes/(?P<episode>.*)|
2270 (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
2271 |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
# Bitrates the site offers, ascending; the last entry is picked as "best" below.
2274 _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
2276 _video_extensions = {
2284 _video_dimensions = {
# Overrides the base class because _VALID_URL needs the re.VERBOSE flag.
2294 def suitable(cls, url):
2295 """Receives a URL and returns True if suitable for this IE."""
2296 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
2298 def report_config_download(self, episode_id, media_id):
2299 self.to_screen(u'%s: Downloading configuration for %s' % (episode_id, media_id))
2301 def report_index_download(self, episode_id):
2302 self.to_screen(u'%s: Downloading show index' % episode_id)
# Prints format id, container extension and dimensions for --list-formats.
2304 def _print_formats(self, formats):
2305 print('Available formats:')
2307 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))
2310 def _real_extract(self, url):
2311 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2313 self._downloader.report_error(u'invalid URL: %s' % url)
# Shortname forms (:tds, :colbert, ...) expand to the show's full-episodes URL
# and the regex is re-applied to the expanded URL.
2316 if mobj.group('shortname'):
2317 if mobj.group('shortname') in ('tds', 'thedailyshow'):
2318 url = u'http://www.thedailyshow.com/full-episodes/'
2320 url = u'http://www.colbertnation.com/full-episodes/'
2321 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2322 assert mobj is not None
# Clip URLs carry their title in different groups per show.
2324 if mobj.group('clip'):
2325 if mobj.group('showname') == 'thedailyshow':
2326 epTitle = mobj.group('tdstitle')
2328 epTitle = mobj.group('cntitle')
# No explicit episode means "download the newest" via the redirect below.
2331 dlNewest = not mobj.group('episode')
2333 epTitle = mobj.group('showname')
2335 epTitle = mobj.group('episode')
2337 req = compat_urllib_request.Request(url)
2338 self.report_extraction(epTitle)
2340 htmlHandle = compat_urllib_request.urlopen(req)
2341 html = htmlHandle.read()
2342 webpage = html.decode('utf-8')
2343 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2344 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
# Follow the server redirect to discover the concrete episode URL.
2347 url = htmlHandle.geturl()
2348 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2350 self._downloader.report_error(u'Invalid redirected URL: ' + url)
2352 if mobj.group('episode') == '':
2353 self._downloader.report_error(u'Redirected URL is still not specific: ' + url)
2355 epTitle = mobj.group('episode')
# Primary way to find the mtvnservices media URI embedded in the page.
2357 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)
2359 if len(mMovieParams) == 0:
2360 # The Colbert Report embeds the information in a without
2361 # a URL prefix; so extract the alternate reference
2362 # and then add the URL prefix manually.
2364 altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
2365 if len(altMovieParams) == 0:
2366 self._downloader.report_error(u'unable to find Flash URL in webpage ' + url)
2369 mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
2371 uri = mMovieParams[0][1]
# The MRSS index lists the episode's parts (one <item> per part).
2372 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
2373 self.report_index_download(epTitle)
2375 indexXml = compat_urllib_request.urlopen(indexUrl).read()
2376 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2377 self._downloader.report_error(u'unable to download episode index: ' + compat_str(err))
2382 idoc = xml.etree.ElementTree.fromstring(indexXml)
2383 itemEls = idoc.findall('.//item')
# Each <item> is one part of the episode; extract and download each in turn.
2384 for partNum,itemEl in enumerate(itemEls):
2385 mediaId = itemEl.findall('./guid')[0].text
2386 shortMediaId = mediaId.split(':')[-1]
2387 showId = mediaId.split(':')[-2].replace('.com', '')
2388 officialTitle = itemEl.findall('./title')[0].text
2389 officialDate = itemEl.findall('./pubDate')[0].text
2391 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2392 compat_urllib_parse.urlencode({'uri': mediaId}))
2393 configReq = compat_urllib_request.Request(configUrl)
2394 self.report_config_download(epTitle, shortMediaId)
2396 configXml = compat_urllib_request.urlopen(configReq).read()
2397 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2398 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
# Collect (bitrate, rtmp-url) pairs from the per-part config XML.
2401 cdoc = xml.etree.ElementTree.fromstring(configXml)
2403 for rendition in cdoc.findall('.//rendition'):
2404 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2408 self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
2411 if self._downloader.params.get('listformats', None):
2412 self._print_formats([i[0] for i in turls])
2415 # For now, just pick the highest bitrate
2416 format,rtmp_video_url = turls[-1]
2418 # Get the format arg from the arg stream
2419 req_format = self._downloader.params.get('format', None)
2421 # Select format if we can find one
2424 format, rtmp_video_url = f, v
# The rtmp URL is rewritten to a plain HTTP URL on the CDN (rtmpdump not needed).
2427 m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
2429 raise ExtractorError(u'Cannot transform RTMP url')
2430 base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
2431 video_url = base + m.group('finalid')
2433 effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
2438 'upload_date': officialDate,
2443 'description': officialTitle,
2445 results.append(info)
# NOTE(review): embedded source-line numbers are non-contiguous — 'try:', 'return'
# and guard statements are elided from this excerpt.
2450 class EscapistIE(InfoExtractor):
2451 """Information extractor for The Escapist """
2453 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2454 IE_NAME = u'escapist'
2456 def report_config_download(self, showName):
2457 self.to_screen(u'%s: Downloading configuration' % showName)
2459 def _real_extract(self, url):
2460 mobj = re.match(self._VALID_URL, url)
2462 self._downloader.report_error(u'invalid URL: %s' % url)
2464 showName = mobj.group('showname')
2465 videoId = mobj.group('episode')
2467 self.report_extraction(showName)
2469 webPage = compat_urllib_request.urlopen(url)
2470 webPageBytes = webPage.read()
# Decode using the charset advertised in Content-Type; default to utf-8.
2471 m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
2472 webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
2473 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2474 self._downloader.report_error(u'unable to download webpage: ' + compat_str(err))
# NOTE(review): the four re.search results below are used unchecked — if any
# meta tag is missing, .group(1) raises AttributeError rather than a clean error.
2477 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2478 description = unescapeHTML(descMatch.group(1))
2479 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2480 imgUrl = unescapeHTML(imgMatch.group(1))
2481 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2482 playerUrl = unescapeHTML(playerUrlMatch.group(1))
# The player URL carries a 'config=' query parameter pointing at the real config.
2483 configUrlMatch = re.search('config=(.*)$', playerUrl)
2484 configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))
2486 self.report_config_download(showName)
2488 configJSON = compat_urllib_request.urlopen(configUrl)
2489 m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
2490 configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
2491 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2492 self._downloader.report_error(u'unable to download configuration: ' + compat_str(err))
2495 # Technically, it's JavaScript, not JSON
# Crude JS -> JSON conversion: swap single quotes for double quotes before parsing.
2496 configJSON = configJSON.replace("'", '"')
2499 config = json.loads(configJSON)
2500 except (ValueError,) as err:
2501 self._downloader.report_error(u'Invalid JSON in configuration file: ' + compat_str(err))
2504 playlist = config['playlist']
# Index 1 of the playlist holds the actual video entry.
2505 videoUrl = playlist[1]['url']
2510 'uploader': showName,
2511 'upload_date': None,
2514 'thumbnail': imgUrl,
2515 'description': description,
2516 'player_url': playerUrl,
# NOTE(review): embedded source-line numbers are non-contiguous — 'try:', 'return'
# and guard statements are elided from this excerpt.
2521 class CollegeHumorIE(InfoExtractor):
2522 """Information extractor for collegehumor.com"""
2525 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2526 IE_NAME = u'collegehumor'
2528 def report_manifest(self, video_id):
2529 """Report information extraction."""
2530 self.to_screen(u'%s: Downloading XML manifest' % video_id)
2532 def _real_extract(self, url):
2533 mobj = re.match(self._VALID_URL, url)
2535 self._downloader.report_error(u'invalid URL: %s' % url)
2537 video_id = mobj.group('videoid')
2542 'upload_date': None,
2545 self.report_extraction(video_id)
# Step 1: fetch the per-video metadata XML (title, thumbnail, manifest URL).
2546 xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
2548 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2549 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2550 self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
2553 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2555 videoNode = mdoc.findall('./video')[0]
2556 info['description'] = videoNode.findall('./description')[0].text
2557 info['title'] = videoNode.findall('./caption')[0].text
2558 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
2559 manifest_url = videoNode.findall('./file')[0].text
2561 self._downloader.report_error(u'Invalid metadata XML file')
# Step 2: fetch the Adobe HDS (f4m) manifest; hdcore is required by the server.
2564 manifest_url += '?hdcore=2.10.3'
2565 self.report_manifest(video_id)
2567 manifestXml = compat_urllib_request.urlopen(manifest_url).read()
2568 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2569 self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
2572 adoc = xml.etree.ElementTree.fromstring(manifestXml)
# The f4m namespace must be spelled out for ElementTree lookups.
2574 media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
2575 node_id = media_node.attrib['url']
2576 video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
2577 except IndexError as err:
2578 self._downloader.report_error(u'Invalid manifest file')
# Step 3: build the direct segment URL from the manifest's host and node id.
2581 url_pr = compat_urllib_parse_urlparse(manifest_url)
2582 url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
# NOTE(review): embedded source-line numbers are non-contiguous — guard and
# 'return' statements are elided from this excerpt.
2589 class XVideosIE(InfoExtractor):
2590 """Information extractor for xvideos.com"""
2592 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2593 IE_NAME = u'xvideos'
2595 def _real_extract(self, url):
2596 mobj = re.match(self._VALID_URL, url)
2598 self._downloader.report_error(u'invalid URL: %s' % url)
2600 video_id = mobj.group(1)
2602 webpage = self._download_webpage(url, video_id)
2604 self.report_extraction(video_id)
# The media URL is embedded URL-encoded in a 'flv_url=' page parameter.
2608 mobj = re.search(r'flv_url=(.+?)&', webpage)
2610 self._downloader.report_error(u'unable to extract video url')
2612 video_url = compat_urllib_parse.unquote(mobj.group(1))
# Title comes from the <title> tag, minus the site-name suffix.
2616 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2618 self._downloader.report_error(u'unable to extract video title')
2620 video_title = mobj.group(1)
2623 # Extract video thumbnail
2624 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
2626 self._downloader.report_error(u'unable to extract video thumbnail')
# group(0): the whole matched URL is the thumbnail address.
2628 video_thumbnail = mobj.group(0)
2634 'upload_date': None,
2635 'title': video_title,
2637 'thumbnail': video_thumbnail,
2638 'description': None,
# NOTE(review): embedded source-line numbers are non-contiguous — 'try:', 'return'
# and guard statements are elided from this excerpt.
2644 class SoundcloudIE(InfoExtractor):
2645 """Information extractor for soundcloud.com
2646 To access the media, the uid of the song and a stream token
2647 must be extracted from the page source and the script must make
2648 a request to media.soundcloud.com/crossdomain.xml. Then
2649 the media can be grabbed by requesting from an url composed
2650 of the stream token and uid
2653 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2654 IE_NAME = u'soundcloud'
2656 def report_resolve(self, video_id):
2657 """Report information extraction."""
2658 self.to_screen(u'%s: Resolving id' % video_id)
2660 def _real_extract(self, url):
2661 mobj = re.match(self._VALID_URL, url)
2663 self._downloader.report_error(u'invalid URL: %s' % url)
2666 # extract uploader (which is in the url)
2667 uploader = mobj.group(1)
2668 # extract simple title (uploader + slug of song title)
2669 slug_title = mobj.group(2)
2670 simple_title = uploader + u'-' + slug_title
2672 self.report_resolve('%s/%s' % (uploader, slug_title))
# Resolve the human-readable URL to track metadata via the public API.
# NOTE(review): client_id is hard-coded; it appears twice in this class and
# twice more in SoundcloudSetIE — a shared constant would keep them in sync.
2674 url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
2675 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2676 request = compat_urllib_request.Request(resolv_url)
2678 info_json_bytes = compat_urllib_request.urlopen(request).read()
2679 info_json = info_json_bytes.decode('utf-8')
2680 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2681 self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
2684 info = json.loads(info_json)
2685 video_id = info['id']
2686 self.report_extraction('%s/%s' % (uploader, slug_title))
# Second API call: per-track stream URLs keyed by format name.
2688 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2689 request = compat_urllib_request.Request(streams_url)
2691 stream_json_bytes = compat_urllib_request.urlopen(request).read()
2692 stream_json = stream_json_bytes.decode('utf-8')
2693 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2694 self._downloader.report_error(u'unable to download stream definitions: %s' % compat_str(err))
2697 streams = json.loads(stream_json)
2698 mediaURL = streams['http_mp3_128_url']
2703 'uploader': info['user']['username'],
# NOTE(review): 'created_at' is passed through as-is; other extractors normalize
# upload_date to YYYYMMDD — confirm downstream tolerance.
2704 'upload_date': info['created_at'],
2705 'title': info['title'],
2707 'description': info['description'],
# NOTE(review): embedded source-line numbers are non-contiguous — 'try:', 'return'
# and guard statements are elided from this excerpt. The per-track logic below
# closely mirrors SoundcloudIE._real_extract, applied to each track of a set.
2710 class SoundcloudSetIE(InfoExtractor):
2711 """Information extractor for soundcloud.com sets
2712 To access the media, the uid of the song and a stream token
2713 must be extracted from the page source and the script must make
2714 a request to media.soundcloud.com/crossdomain.xml. Then
2715 the media can be grabbed by requesting from an url composed
2716 of the stream token and uid
2719 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
2720 IE_NAME = u'soundcloud:set'
2722 def report_resolve(self, video_id):
2723 """Report information extraction."""
2724 self.to_screen(u'%s: Resolving id' % video_id)
2726 def _real_extract(self, url):
2727 mobj = re.match(self._VALID_URL, url)
2729 self._downloader.report_error(u'invalid URL: %s' % url)
2732 # extract uploader (which is in the url)
2733 uploader = mobj.group(1)
2734 # extract simple title (uploader + slug of song title)
2735 slug_title = mobj.group(2)
2736 simple_title = uploader + u'-' + slug_title
2738 self.report_resolve('%s/sets/%s' % (uploader, slug_title))
# Resolve the set URL to its metadata (including the track list) via the API.
2740 url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
2741 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2742 request = compat_urllib_request.Request(resolv_url)
2744 info_json_bytes = compat_urllib_request.urlopen(request).read()
2745 info_json = info_json_bytes.decode('utf-8')
2746 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2747 self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
# API-level errors come back inside the JSON payload rather than as HTTP errors.
2751 info = json.loads(info_json)
2752 if 'errors' in info:
2753 for err in info['errors']:
2754 self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
# One streams lookup per track in the set.
2757 for track in info['tracks']:
2758 video_id = track['id']
2759 self.report_extraction('%s/sets/%s' % (uploader, slug_title))
2761 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2762 request = compat_urllib_request.Request(streams_url)
2764 stream_json_bytes = compat_urllib_request.urlopen(request).read()
2765 stream_json = stream_json_bytes.decode('utf-8')
2766 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2767 self._downloader.report_error(u'unable to download stream definitions: %s' % compat_str(err))
2770 streams = json.loads(stream_json)
2771 mediaURL = streams['http_mp3_128_url']
2776 'uploader': track['user']['username'],
2777 'upload_date': track['created_at'],
2778 'title': track['title'],
2780 'description': track['description'],
# NOTE(review): embedded source-line numbers are non-contiguous — guard and
# 'return' statements are elided from this excerpt.
2785 class InfoQIE(InfoExtractor):
2786 """Information extractor for infoq.com"""
2787 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2789 def _real_extract(self, url):
2790 mobj = re.match(self._VALID_URL, url)
2792 self._downloader.report_error(u'invalid URL: %s' % url)
# No numeric id in the URL scheme, so the URL itself serves as the id here.
2795 webpage = self._download_webpage(url, video_id=url)
2796 self.report_extraction(url)
# The real media path is base64-encoded in a JS variable on the page.
2799 mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
2801 self._downloader.report_error(u'unable to extract video url')
2803 real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
2804 video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id
2807 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2809 self._downloader.report_error(u'unable to extract video title')
2811 video_title = mobj.group(1)
2813 # Extract description
# Description is optional; keep the placeholder when the meta tag is absent.
2814 video_description = u'No description available.'
2815 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2816 if mobj is not None:
2817 video_description = mobj.group(1)
# The final id/extension come from the media filename inside the decoded path.
2819 video_filename = video_url.split('/')[-1]
2820 video_id, extension = video_filename.split('.')
2826 'upload_date': None,
2827 'title': video_title,
2828 'ext': extension, # Extension is always(?) mp4, but seems to be flv
2830 'description': video_description,
# NOTE(review): embedded source-line numbers are non-contiguous — 'try:', 'return'
# and guard statements are elided from this excerpt.
2835 class MixcloudIE(InfoExtractor):
2836 """Information extractor for www.mixcloud.com"""
# Marked broken: skipped by tests and flagged to users (see class docs in HEAD).
2838 _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
2839 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2840 IE_NAME = u'mixcloud'
2842 def report_download_json(self, file_id):
2843 """Report JSON download."""
2844 self.to_screen(u'Downloading json')
2846 def get_urls(self, jsonData, fmt, bitrate='best'):
2847 """Get urls from 'audio_formats' section in json"""
2850 bitrate_list = jsonData[fmt]
2851 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2852 bitrate = max(bitrate_list) # select highest
2854 url_list = jsonData[fmt][bitrate]
# Some entries are plain lists (no per-bitrate dict); indexing by bitrate
# then raises TypeError, handled here as "no bitrate info".
2855 except TypeError: # we have no bitrate info.
2856 url_list = jsonData[fmt]
2859 def check_urls(self, url_list):
2860 """Returns 1st active url from list"""
# Probes each candidate with a request; network failures skip to the next URL.
2861 for url in url_list:
2863 compat_urllib_request.urlopen(url)
2865 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
# Prints format/bitrate/extension triples for --list-formats.
2870 def _print_formats(self, formats):
2871 print('Available formats:')
2872 for fmt in formats.keys():
2873 for b in formats[fmt]:
2875 ext = formats[fmt][b][0]
2876 print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
2877 except TypeError: # we have no bitrate info
2878 ext = formats[fmt][0]
2879 print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
2882 def _real_extract(self, url):
2883 mobj = re.match(self._VALID_URL, url)
2885 self._downloader.report_error(u'invalid URL: %s' % url)
2887 # extract uploader & filename from url
# NOTE(review): .decode('utf-8') on regex-group str values is Python 2 idiom;
# would raise AttributeError on Python 3 (consistent with _WORKING = False).
2888 uploader = mobj.group(1).decode('utf-8')
2889 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2891 # construct API request
2892 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2893 # retrieve .json file with links to files
2894 request = compat_urllib_request.Request(file_url)
2896 self.report_download_json(file_url)
2897 jsonData = compat_urllib_request.urlopen(request).read()
2898 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2899 self._downloader.report_error(u'Unable to retrieve file: %s' % compat_str(err))
2903 json_data = json.loads(jsonData)
2904 player_url = json_data['player_swf_url']
2905 formats = dict(json_data['audio_formats'])
2907 req_format = self._downloader.params.get('format', None)
2910 if self._downloader.params.get('listformats', None):
2911 self._print_formats(formats)
# 'best': take the first format whose candidate URL list yields a live URL.
2914 if req_format is None or req_format == 'best':
2915 for format_param in formats.keys():
2916 url_list = self.get_urls(formats, format_param)
2918 file_url = self.check_urls(url_list)
2919 if file_url is not None:
2922 if req_format not in formats:
2923 self._downloader.report_error(u'format is not available')
2926 url_list = self.get_urls(formats, req_format)
2927 file_url = self.check_urls(url_list)
2928 format_param = req_format
2931 'id': file_id.decode('utf-8'),
2932 'url': file_url.decode('utf-8'),
2933 'uploader': uploader.decode('utf-8'),
2934 'upload_date': None,
2935 'title': json_data['name'],
2936 'ext': file_url.split('.')[-1].decode('utf-8'),
2937 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2938 'thumbnail': json_data['thumbnail_url'],
2939 'description': json_data['description'],
2940 'player_url': player_url.decode('utf-8'),
# NOTE(review): embedded source-line numbers are non-contiguous — 'try:', 'return',
# 'else:' and guard statements are elided from this excerpt.
2943 class StanfordOpenClassroomIE(InfoExtractor):
2944 """Information extractor for Stanford's Open ClassRoom"""
# Matches three URL shapes: a specific video, a course page, or the site root;
# the optional 'course'/'video' groups drive the three branches below.
2946 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
2947 IE_NAME = u'stanfordoc'
2949 def _real_extract(self, url):
2950 mobj = re.match(self._VALID_URL, url)
2952 raise ExtractorError(u'Invalid URL: %s' % url)
2954 if mobj.group('course') and mobj.group('video'): # A specific video
2955 course = mobj.group('course')
2956 video = mobj.group('video')
2958 'id': course + '_' + video,
2960 'upload_date': None,
2963 self.report_extraction(info['id'])
# Per-video metadata lives in <video>.xml next to the course's videos folder.
2964 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
2965 xmlUrl = baseUrl + video + '.xml'
2967 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2968 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2969 self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
2971 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2973 info['title'] = mdoc.findall('./title')[0].text
2974 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
2976 self._downloader.report_error(u'Invalid metadata XML file')
2978 info['ext'] = info['url'].rpartition('.')[2]
2980 elif mobj.group('course'): # A course page
2981 course = mobj.group('course')
2986 'upload_date': None,
2989 coursepage = self._download_webpage(url, info['id'],
2990 note='Downloading course info page',
2991 errnote='Unable to download course info page')
2993 m = re.search('<h1>([^<]+)</h1>', coursepage)
2995 info['title'] = unescapeHTML(m.group(1))
2997 info['title'] = info['id']
2999 m = re.search('<description>([^<]+)</description>', coursepage)
3001 info['description'] = unescapeHTML(m.group(1))
# Collect VideoPage links and recurse into each via self.extract().
3003 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
3006 'type': 'reference',
3007 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
3011 for entry in info['list']:
3012 assert entry['type'] == 'reference'
3013 results += self.extract(entry['url'])
# Root page branch: enumerate all courses and recurse into each CoursePage.
3017 'id': 'Stanford OpenClassroom',
3020 'upload_date': None,
3023 self.report_download_webpage(info['id'])
3024 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
3026 rootpage = compat_urllib_request.urlopen(rootURL).read()
3027 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3028 self._downloader.report_error(u'unable to download course info page: ' + compat_str(err))
3031 info['title'] = info['id']
3033 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
3036 'type': 'reference',
3037 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
3042 for entry in info['list']:
3043 assert entry['type'] == 'reference'
3044 results += self.extract(entry['url'])
# NOTE(review): whitespace-mangled listing with elided lines (the stray leading
# integers are paste artifacts); code left byte-identical, comments only.
3047 class MTVIE(InfoExtractor):
3048     """Information extractor for MTV.com"""
3050 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
3053 def _real_extract(self, url):
3054 mobj = re.match(self._VALID_URL, url)
# NOTE(review): `if mobj is None:` guard appears elided before this error call.
3056 self._downloader.report_error(u'invalid URL: %s' % url)
# Scheme is optional in _VALID_URL; default to http:// when absent.
3058 if not mobj.group('proto'):
3059 url = 'http://' + url
3060 video_id = mobj.group('videoid')
3062 webpage = self._download_webpage(url, video_id)
# Song name, performer and playlist URI are read from <meta> tags on the page.
3064 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
3066 self._downloader.report_error(u'unable to extract song name')
# NOTE(review): .decode() on a str here looks Python-2-only — confirm target runtime.
3068 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3069 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
3071 self._downloader.report_error(u'unable to extract performer')
3073 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3074 video_title = performer + ' - ' + song_name
3076 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
3078 self._downloader.report_error(u'unable to mtvn_uri')
3080 mtvn_uri = mobj.group(1)
3082 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
3084 self._downloader.report_error(u'unable to extract content id')
3086 content_id = mobj.group(1)
# mediaGen endpoint returns an XML document listing available renditions.
3088 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
3089 self.report_extraction(video_id)
3090 request = compat_urllib_request.Request(videogen_url)
3092 metadataXml = compat_urllib_request.urlopen(request).read()
3093 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3094 self._downloader.report_error(u'unable to download video metadata: %s' % compat_str(err))
3097 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
3098 renditions = mdoc.findall('.//rendition')
3100 # For now, always pick the highest quality.
3101 rendition = renditions[-1]
# Format id is derived from MIME subtype + WxH + bitrate of the rendition.
3104 _,_,ext = rendition.attrib['type'].partition('/')
3105 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
3106 video_url = rendition.find('./src').text
3108 self._downloader.report_error('Invalid rendition field.')
3114 'uploader': performer,
3115 'upload_date': None,
3116 'title': video_title,
# NOTE(review): whitespace-mangled listing with elided lines (the stray leading
# integers are paste artifacts); code left byte-identical, comments only.
3124 class YoukuIE(InfoExtractor):
3125 _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
# Builds a session id string from current ms timestamp + two random numbers.
# NOTE(review): the `def _gen_sid(self):` header appears elided from this listing.
3128 nowTime = int(time.time() * 1000)
3129 random1 = random.randint(1000,1998)
3130 random2 = random.randint(1000,9999)
3132 return "%d%d%d" %(nowTime,random1,random2)
# Deterministic Fisher-Yates-like shuffle of the alphabet, seeded by `seed`;
# used to decode Youku's obfuscated file ids.
3134 def _get_file_ID_mix_string(self, seed):
3136 source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
3138 for i in range(len(source)):
# Linear-congruential step; the statement order here is load-bearing.
3139 seed = (seed * 211 + 30031 ) % 65536
3140 index = math.floor(seed / 65536 * len(source) )
3141 mixed.append(source[int(index)])
3142 source.remove(source[int(index)])
3143 #return ''.join(mixed)
# Maps each '*'-separated numeric token of fileId through the mixed alphabet.
3146 def _get_file_id(self, fileId, seed):
3147 mixed = self._get_file_ID_mix_string(seed)
3148 ids = fileId.split('*')
3152 realId.append(mixed[int(ch)])
3153 return ''.join(realId)
3155 def _real_extract(self, url):
3156 mobj = re.match(self._VALID_URL, url)
# NOTE(review): `if mobj is None:` guard appears elided before this error call.
3158 self._downloader.report_error(u'invalid URL: %s' % url)
3160 video_id = mobj.group('ID')
# getPlayList returns JSON with title, seed, stream file ids and segment keys.
3162 info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
3164 request = compat_urllib_request.Request(info_url, None, std_headers)
3166 self.report_download_webpage(video_id)
3167 jsondata = compat_urllib_request.urlopen(request).read()
3168 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3169 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
3172 self.report_extraction(video_id)
3174 jsonstr = jsondata.decode('utf-8')
3175 config = json.loads(jsonstr)
3177 video_title = config['data'][0]['title']
3178 seed = config['data'][0]['seed']
# Resolve the requested --format against what the stream listing offers.
3180 format = self._downloader.params.get('format', None)
3181 supported_format = list(config['data'][0]['streamfileids'].keys())
3183 if format is None or format == 'best':
3184 if 'hd2' in supported_format:
3189 elif format == 'worst':
3197 fileid = config['data'][0]['streamfileids'][format]
3198 keys = [s['k'] for s in config['data'][0]['segs'][format]]
3199 except (UnicodeDecodeError, ValueError, KeyError):
3200 self._downloader.report_error(u'unable to extract info section')
3204 sid = self._gen_sid()
3205 fileid = self._get_file_id(fileid, seed)
3207 #column 8,9 of fileid represent the segment number
3208 #fileid[7:9] should be changed
# One downloadable info dict per segment; segment index is hex-encoded into
# both the fileid and the getFlvPath URL.
3209 for index, key in enumerate(keys):
3211 temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
3212 download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
3215 'id': '%s_part%02d' % (video_id, index),
3216 'url': download_url,
3218 'upload_date': None,
3219 'title': video_title,
3222 files_info.append(info)
# NOTE(review): whitespace-mangled listing with elided lines (the stray leading
# integers are paste artifacts); code left byte-identical, comments only.
3227 class XNXXIE(InfoExtractor):
3228     """Information extractor for xnxx.com"""
3230 _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
# All three fields are scraped with simple regexes over the raw page source.
3232 VIDEO_URL_RE = r'flv_url=(.*?)&'
3233 VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
3234 VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'
3236 def _real_extract(self, url):
3237 mobj = re.match(self._VALID_URL, url)
# NOTE(review): `if mobj is None:` guard appears elided before this error call.
3239 self._downloader.report_error(u'invalid URL: %s' % url)
3241 video_id = mobj.group(1)
3243 self.report_download_webpage(video_id)
3245 # Get webpage content
3247 webpage_bytes = compat_urllib_request.urlopen(url).read()
3248 webpage = webpage_bytes.decode('utf-8')
3249 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3250 self._downloader.report_error(u'unable to download video webpage: %s' % err)
# flv_url is percent-encoded in the page; unquote it to get the real URL.
3253 result = re.search(self.VIDEO_URL_RE, webpage)
3255 self._downloader.report_error(u'unable to extract video url')
3257 video_url = compat_urllib_parse.unquote(result.group(1))
3259 result = re.search(self.VIDEO_TITLE_RE, webpage)
3261 self._downloader.report_error(u'unable to extract video title')
3263 video_title = result.group(1)
3265 result = re.search(self.VIDEO_THUMB_RE, webpage)
3267 self._downloader.report_error(u'unable to extract video thumbnail')
3269 video_thumbnail = result.group(1)
3275 'upload_date': None,
3276 'title': video_title,
3278 'thumbnail': video_thumbnail,
3279 'description': None,
# NOTE(review): whitespace-mangled listing with elided lines (the stray leading
# integers are paste artifacts); code left byte-identical, comments only.
3283 class GooglePlusIE(InfoExtractor):
3284     """Information extractor for plus.google.com."""
3286 _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
3287 IE_NAME = u'plus.google'
# The report_* helpers below only emit progress messages to the screen.
3289 def report_extract_entry(self, url):
3290     """Report downloading extry"""
3291 self.to_screen(u'Downloading entry: %s' % url)
3293 def report_date(self, upload_date):
3294     """Report downloading extry"""
3295 self.to_screen(u'Entry date: %s' % upload_date)
3297 def report_uploader(self, uploader):
3298     """Report downloading extry"""
3299 self.to_screen(u'Uploader: %s' % uploader)
3301 def report_title(self, video_title):
3302     """Report downloading extry"""
3303 self.to_screen(u'Title: %s' % video_title)
3305 def report_extract_vid_page(self, video_page):
3306     """Report information extraction."""
3307 self.to_screen(u'Extracting video page: %s' % video_page)
3309 def _real_extract(self, url):
3310 # Extract id from URL
3311 mobj = re.match(self._VALID_URL, url)
# NOTE(review): `if mobj is None:` guard appears elided before this error call.
3313 self._downloader.report_error(u'Invalid URL: %s' % url)
3316 post_url = mobj.group(0)
3317 video_id = mobj.group(1)
3319 video_extension = 'flv'
3321 # Step 1, Retrieve post webpage to extract further information
3322 self.report_extract_entry(post_url)
3323 request = compat_urllib_request.Request(post_url)
3325 webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3326 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3327 self._downloader.report_error(u'Unable to retrieve entry webpage: %s' % compat_str(err))
3330 # Extract update date
3332 pattern = 'title="Timestamp">(.*?)</a>'
3333 mobj = re.search(pattern, webpage)
3335 upload_date = mobj.group(1)
3336 # Convert timestring to a format suitable for filename
3337 upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3338 upload_date = upload_date.strftime('%Y%m%d')
3339 self.report_date(upload_date)
3343 pattern = r'rel\="author".*?>(.*?)</a>'
3344 mobj = re.search(pattern, webpage)
3346 uploader = mobj.group(1)
3347 self.report_uploader(uploader)
3350 # Get the first line for title
3352 pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3353 mobj = re.search(pattern, webpage)
3355 video_title = mobj.group(1)
3356 self.report_title(video_title)
3358 # Step 2, Stimulate clicking the image box to launch video
3359 pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3360 mobj = re.search(pattern, webpage)
3362 self._downloader.report_error(u'unable to extract video page URL')
3364 video_page = mobj.group(1)
3365 request = compat_urllib_request.Request(video_page)
3367 webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3368 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3369 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
3371 self.report_extract_vid_page(video_page)
3374 # Extract video links on video page
3375     """Extract video links of all sizes"""
# findall yields (resolution, url) tuples; sorting puts highest resolution last.
3376 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3377 mobj = re.findall(pattern, webpage)
3379 self._downloader.report_error(u'unable to extract video links')
3381 # Sort in resolution
3382 links = sorted(mobj)
3384 # Choose the lowest of the sort, i.e. highest resolution
3385 video_url = links[-1]
3386 # Only get the url. The resolution part in the tuple has no use anymore
3387 video_url = video_url[-1]
3388 # Treat escaped \u0026 style hex
# Py2 str has .decode; on Py3 the AttributeError path re-encodes then decodes.
3390 video_url = video_url.decode("unicode_escape")
3391 except AttributeError: # Python 3
3392 video_url = bytes(video_url, 'ascii').decode('unicode-escape')
3398 'uploader': uploader,
3399 'upload_date': upload_date,
3400 'title': video_title,
3401 'ext': video_extension,
# NOTE(review): whitespace-mangled listing with elided lines (the stray leading
# integers are paste artifacts); code left byte-identical, comments only.
3404 class NBAIE(InfoExtractor):
3405 _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
3408 def _real_extract(self, url):
3409 mobj = re.match(self._VALID_URL, url)
# NOTE(review): `if mobj is None:` guard appears elided before this error call.
3411 self._downloader.report_error(u'invalid URL: %s' % url)
3414 video_id = mobj.group(1)
# Normalize directory-style URLs: drop a trailing /index.html from the id.
3415 if video_id.endswith('/index.html'):
3416 video_id = video_id[:-len('/index.html')]
3418 webpage = self._download_webpage(url, video_id)
# Direct CDN URL is derived from the path id; no URL appears in the page itself.
3420 video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
# Small scraping helper: first regex group, HTML-unescaped, or `default`.
3421 def _findProp(rexp, default=None):
3422 m = re.search(rexp, webpage)
3424 return unescapeHTML(m.group(1))
3428 shortened_video_id = video_id.rpartition('/')[2]
3429 title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
3431 'id': shortened_video_id,
# NOTE(review): 'uploader_date' looks like a typo for 'upload_date' — confirm.
3435 'uploader_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
3436 'description': _findProp(r'<div class="description">(.*?)</h1>'),
# NOTE(review): whitespace-mangled listing with elided lines (the stray leading
# integers are paste artifacts); code left byte-identical, comments only.
3440 class JustinTVIE(InfoExtractor):
3441     """Information extractor for justin.tv and twitch.tv"""
3442 # TODO: One broadcast may be split into multiple videos. The key
3443 # 'broadcast_id' is the same for all parts, and 'broadcast_part'
3444 # starts at 1 and increases. Can we treat all parts as one video?
3446 _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
3447 ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
# Page size used when paging through a channel's archive via the API.
3448 _JUSTIN_PAGE_LIMIT = 100
3449 IE_NAME = u'justin.tv'
3451 def report_download_page(self, channel, offset):
3452     """Report attempt to download a single page of videos."""
3453 self.to_screen(u'%s: Downloading video information from %d to %d' %
3454 (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
3456 # Return count of items, list of *valid* items
3457 def _parse_page(self, url):
3459 urlh = compat_urllib_request.urlopen(url)
3460 webpage_bytes = urlh.read()
3461 webpage = webpage_bytes.decode('utf-8', 'ignore')
3462 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3463 self._downloader.report_error(u'unable to download video info JSON: %s' % compat_str(err))
3466 response = json.loads(webpage)
# API errors come back as a dict with an 'error' key instead of a list.
3467 if type(response) != list:
3468 error_text = response.get('error', 'unknown error')
3469 self._downloader.report_error(u'Justin.tv API: %s' % error_text)
3472 for clip in response:
3473 video_url = clip['video_file_url']
3475 video_extension = os.path.splitext(video_url)[1][1:]
# start_time is ISO-ish; strip the dashes from YYYY-MM-DD to get YYYYMMDD.
3476 video_date = re.sub('-', '', clip['start_time'][:10])
3477 video_uploader_id = clip.get('user_id', clip.get('channel_id'))
3478 video_id = clip['id']
3479 video_title = clip.get('title', video_id)
3483 'title': video_title,
3484 'uploader': clip.get('channel_name', video_uploader_id),
3485 'uploader_id': video_uploader_id,
3486 'upload_date': video_date,
3487 'ext': video_extension,
# Returns total item count plus the parsed info dicts for this page.
3489 return (len(response), info)
3491 def _real_extract(self, url):
3492 mobj = re.match(self._VALID_URL, url)
# NOTE(review): `if mobj is None:` guard appears elided before this error call.
3494 self._downloader.report_error(u'invalid URL: %s' % url)
3497 api = 'http://api.justin.tv'
3498 video_id = mobj.group(mobj.lastindex)
# lastindex == 1 means only the channel group matched (no /b/<id> part).
3500 if mobj.lastindex == 1:
3502 api += '/channel/archives/%s.json'
3504 api += '/broadcast/by_archive/%s.json'
3505 api = api % (video_id,)
3507 self.report_extraction(video_id)
3511 limit = self._JUSTIN_PAGE_LIMIT
3514 self.report_download_page(video_id, offset)
3515 page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
3516 page_count, page_info = self._parse_page(page_url)
3517 info.extend(page_info)
# A short page signals the end of the archive when paging.
3518 if not paged or page_count != limit:
# NOTE(review): whitespace-mangled listing with elided lines (the stray leading
# integers are paste artifacts); code left byte-identical, comments only.
3523 class FunnyOrDieIE(InfoExtractor):
3524 _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
3526 def _real_extract(self, url):
3527 mobj = re.match(self._VALID_URL, url)
# NOTE(review): `if mobj is None:` guard appears elided before this error call.
3529 self._downloader.report_error(u'invalid URL: %s' % url)
3532 video_id = mobj.group('id')
3533 webpage = self._download_webpage(url, video_id)
# Video URL is the second <source> inside the <video> element.
3535 m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
3537 self._downloader.report_error(u'unable to find video information')
3538 video_url = unescapeHTML(m.group('url'))
# Title: prefer the player h1, fall back to the page <title>.
3540 m = re.search(r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", webpage, flags=re.DOTALL)
3542 m = re.search(r'<title>(?P<title>[^<]+?)</title>', webpage)
3544 self._downloader.report_error(u'Cannot find video title')
3545 title = clean_html(m.group('title'))
3547 m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
3549 desc = unescapeHTML(m.group('desc'))
3558 'description': desc,
# NOTE(review): whitespace-mangled listing with elided lines (the stray leading
# integers are paste artifacts); code left byte-identical, comments only.
3562 class SteamIE(InfoExtractor):
3563 _VALID_URL = r"""http://store.steampowered.com/
3564 (?P<urltype>video|app)/ #If the page is only for videos or for a game
3566 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
# suitable() is overridden because _VALID_URL is a verbose (re.VERBOSE) pattern.
3570 def suitable(cls, url):
3571     """Receives a URL and returns True if suitable for this IE."""
3572 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
3574 def _real_extract(self, url):
3575 m = re.match(self._VALID_URL, url, re.VERBOSE)
3576 gameID = m.group('gameID')
# The agecheck URL with a fixed 1970 birthdate bypasses Steam's age gate.
3577 videourl = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970' % gameID
3578 self.report_age_confirmation()
3579 webpage = self._download_webpage(videourl, gameID)
3580 game_title = re.search(r'<h2 class="pageheader">(?P<game_title>.*?)</h2>', webpage).group('game_title')
# Three parallel scans: movie entries, their titles, and their thumbnails,
# zipped together positionally below.
3582 urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
3583 mweb = re.finditer(urlRE, webpage)
3584 namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
3585 titles = re.finditer(namesRE, webpage)
3586 thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
3587 thumbs = re.finditer(thumbsRE, webpage)
3589 for vid,vtitle,thumb in zip(mweb,titles,thumbs):
3590 video_id = vid.group('videoID')
3591 title = vtitle.group('videoName')
3592 video_url = vid.group('videoURL')
3593 video_thumb = thumb.group('thumbnail')
3595 self._downloader.report_error(u'Cannot find video url for %s' % video_id)
3600 'title': unescapeHTML(title),
3601 'thumbnail': video_thumb
# All per-movie entries are returned as a single playlist for the game.
3604 return [self.playlist_result(videos, gameID, game_title)]
# NOTE(review): whitespace-mangled listing with elided lines (the stray leading
# integers are paste artifacts); code left byte-identical, comments only.
3606 class UstreamIE(InfoExtractor):
3607 _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
3608 IE_NAME = u'ustream'
3610 def _real_extract(self, url):
3611 m = re.match(self._VALID_URL, url)
3612 video_id = m.group('videoID')
# Direct CDN URL is built from the id; the page is only scraped for metadata.
3613 video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
3614 webpage = self._download_webpage(url, video_id)
3615 m = re.search(r'data-title="(?P<title>.+)"',webpage)
3616 title = m.group('title')
3617 m = re.search(r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',webpage)
3618 uploader = m.group('uploader')
3624 'uploader': uploader
# NOTE(review): whitespace-mangled listing with elided lines (the stray leading
# integers are paste artifacts); code left byte-identical, comments only.
3628 class WorldStarHipHopIE(InfoExtractor):
3629 _VALID_URL = r'http://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
3630 IE_NAME = u'WorldStarHipHop'
3632 def _real_extract(self, url):
# Matches a direct hw-videos CDN link ending in mp4 or flv within the page.
3633 _src_url = r"""(http://hw-videos.*(?:mp4|flv))"""
3635 webpage_src = compat_urllib_request.urlopen(url).read()
3636 webpage_src = webpage_src.decode('utf-8')
3638 mobj = re.search(_src_url, webpage_src)
3640 m = re.match(self._VALID_URL, url)
3641 video_id = m.group('id')
3643 if mobj is not None:
3644 video_url = mobj.group()
# Extension branch: mp4 vs flv (the assignments themselves are elided here).
3645 if 'mp4' in video_url:
3650 self._downloader.report_error(u'Cannot find video url for %s' % video_id)
3653 _title = r"""<title>(.*)</title>"""
3655 mobj = re.search(_title, webpage_src)
3657 if mobj is not None:
3658 title = mobj.group(1)
# Fallback title when the page has no usable <title>.
3660 title = 'World Start Hip Hop - %s' % time.ctime()
3662 _thumbnail = r"""rel="image_src" href="(.*)" />"""
3663 mobj = re.search(_thumbnail, webpage_src)
3665 # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
3666 if mobj is not None:
3667 thumbnail = mobj.group(1)
# Candy pages carry the real title in a candytitles span instead.
3669 _title = r"""candytitles.*>(.*)</span>"""
3670 mobj = re.search(_title, webpage_src)
3671 if mobj is not None:
3672 title = mobj.group(1)
3679 'thumbnail' : thumbnail,
# NOTE(review): whitespace-mangled listing with elided lines (the stray leading
# integers are paste artifacts); code left byte-identical, comments only.
3684 class RBMARadioIE(InfoExtractor):
3685 _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'
3687 def _real_extract(self, url):
3688 m = re.match(self._VALID_URL, url)
3689 video_id = m.group('videoID')
3691 webpage = self._download_webpage(url, video_id)
# Show metadata is embedded as a JSON blob in an inline <script> tag.
3692 m = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
3694 raise ExtractorError(u'Cannot find metadata')
3695 json_data = m.group(1)
3698 data = json.loads(json_data)
3699 except ValueError as e:
3700 raise ExtractorError(u'Invalid JSON: ' + str(e))
# Force a 256 kbps constant-bitrate variant of the Akamai stream URL.
3702 video_url = data['akamai_url'] + '&cbr=256'
3703 url_parts = compat_urllib_parse_urlparse(video_url)
3704 video_ext = url_parts.path.rpartition('.')[2]
# Remaining fields are optional in the JSON; .get() tolerates their absence.
3709 'title': data['title'],
3710 'description': data.get('teaser_text'),
3711 'location': data.get('country_of_origin'),
3712 'uploader': data.get('host', {}).get('name'),
3713 'uploader_id': data.get('host', {}).get('slug'),
3714 'thumbnail': data.get('image', {}).get('large_url_2x'),
3715 'duration': data.get('duration'),
# NOTE(review): whitespace-mangled listing with elided lines (the stray leading
# integers are paste artifacts); code left byte-identical, comments only.
3720 class YouPornIE(InfoExtractor):
3721     """Information extractor for youporn.com."""
3722 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
# Used by --list-formats: prints ext/format for every discovered variant.
3724 def _print_formats(self, formats):
3725     """Print all available formats"""
3726 print(u'Available formats:')
3727 print(u'ext\t\tformat')
3728 print(u'---------------------------------')
3729 for format in formats:
3730 print(u'%s\t\t%s' % (format['ext'], format['format']))
# Selects the single entry whose 'format' equals the requested one.
3732 def _specific(self, req_format, formats):
3734 if(x["format"]==req_format):
3738 def _real_extract(self, url):
3739 mobj = re.match(self._VALID_URL, url)
# NOTE(review): `if mobj is None:` guard appears elided before this error call.
3741 self._downloader.report_error(u'invalid URL: %s' % url)
3744 video_id = mobj.group('videoid')
# age_verified cookie bypasses the site's age gate.
3746 req = compat_urllib_request.Request(url)
3747 req.add_header('Cookie', 'age_verified=1')
3748 webpage = self._download_webpage(req, video_id)
3750 # Get the video title
3751 result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
3753 raise ExtractorError(u'Unable to extract video title')
3754 video_title = result.group('title').strip()
3756 # Get the video date
3757 result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
3759 self._downloader.report_warning(u'unable to extract video date')
3762 upload_date = result.group('date').strip()
3764 # Get the video uploader
3765 result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
3767 self._downloader.report_warning(u'unable to extract uploader')
3768 video_uploader = None
3770 video_uploader = result.group('uploader').strip()
3771 video_uploader = clean_html( video_uploader )
3773 # Get all of the formats available
3774 DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
3775 result = re.search(DOWNLOAD_LIST_RE, webpage)
3777 raise ExtractorError(u'Unable to extract download list')
3778 download_list_html = result.group('download_list').strip()
3780 # Get all of the links from the page
3781 LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
3782 links = re.findall(LINK_RE, download_list_html)
3783 if(len(links) == 0):
3784 raise ExtractorError(u'ERROR: no known formats available for video')
3786 self.to_screen(u'Links found: %d' % len(links))
3791 # A link looks like this:
3792 # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
3793 # A path looks like this:
3794 # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
# Format id (e.g. "480p-370k") is parsed out of the URL path's 5th segment.
3795 video_url = unescapeHTML( link )
3796 path = compat_urllib_parse_urlparse( video_url ).path
3797 extension = os.path.splitext( path )[1][1:]
3798 format = path.split('/')[4].split('_')[:2]
3801 format = "-".join( format )
3802 title = u'%s-%s-%s' % (video_title, size, bitrate)
3807 'uploader': video_uploader,
3808 'upload_date': upload_date,
3813 'description': None,
# Format selection: list / best / worst / all / a specific format string.
3817 if self._downloader.params.get('listformats', None):
3818 self._print_formats(formats)
3821 req_format = self._downloader.params.get('format', None)
3822 self.to_screen(u'Format: %s' % req_format)
3824 if req_format is None or req_format == 'best':
3826 elif req_format == 'worst':
3827 return [formats[-1]]
3828 elif req_format in ('-1', 'all'):
3831 format = self._specific( req_format, formats )
3833 self._downloader.report_error(u'requested format not available')
# NOTE(review): whitespace-mangled listing with elided lines (the stray leading
# integers are paste artifacts); code left byte-identical, comments only.
3839 class PornotubeIE(InfoExtractor):
3840     """Information extractor for pornotube.com."""
3841 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
3843 def _real_extract(self, url):
3844 mobj = re.match(self._VALID_URL, url)
# NOTE(review): `if mobj is None:` guard appears elided before this error call.
3846 self._downloader.report_error(u'invalid URL: %s' % url)
# Both id and title come straight from the URL itself.
3849 video_id = mobj.group('videoid')
3850 video_title = mobj.group('title')
3852 # Get webpage content
3853 webpage = self._download_webpage(url, video_id)
# The flv URL is embedded (percent-encoded) in the player's JS config.
3856 VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
3857 result = re.search(VIDEO_URL_RE, webpage)
3859 self._downloader.report_error(u'unable to extract video url')
3861 video_url = compat_urllib_parse.unquote(result.group('url'))
3863 #Get the uploaded date
3864 VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
3865 result = re.search(VIDEO_UPLOADED_RE, webpage)
# NOTE(review): error message says "title" but this path concerns the date.
3867 self._downloader.report_error(u'unable to extract video title')
3869 upload_date = result.group('date')
3871 info = {'id': video_id,
3874 'upload_date': upload_date,
3875 'title': video_title,
# NOTE(review): whitespace-mangled listing with elided lines (the stray leading
# integers are paste artifacts); code left byte-identical, comments only.
3881 class YouJizzIE(InfoExtractor):
3882     """Information extractor for youjizz.com."""
3883 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'
3885 def _real_extract(self, url):
3886 mobj = re.match(self._VALID_URL, url)
# NOTE(review): `if mobj is None:` guard appears elided before this error call.
3888 self._downloader.report_error(u'invalid URL: %s' % url)
3891 video_id = mobj.group('videoid')
3893 # Get webpage content
3894 webpage = self._download_webpage(url, video_id)
3896 # Get the video title
3897 result = re.search(r'<title>(?P<title>.*)</title>', webpage)
3899 raise ExtractorError(u'ERROR: unable to extract video title')
3900 video_title = result.group('title').strip()
3902 # Get the embed page
# The real stream URL only appears on the embed page, not the watch page;
# note the numeric embed id replaces the slug-style watch-page id.
3903 result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
3905 raise ExtractorError(u'ERROR: unable to extract embed page')
3907 embed_page_url = result.group(0).strip()
3908 video_id = result.group('videoid')
3910 webpage = self._download_webpage(embed_page_url, video_id)
# Stream URL is passed to the flash player via so.addVariable("file", ...).
3913 result = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
3915 raise ExtractorError(u'ERROR: unable to extract video url')
3916 video_url = result.group('source')
3918 info = {'id': video_id,
3920 'title': video_title,
3923 'player_url': embed_page_url}
# NOTE(review): whitespace-mangled listing with elided lines (the stray leading
# integers are paste artifacts); code left byte-identical, comments only.
3927 class EightTracksIE(InfoExtractor):
3929 _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'
3931 def _real_extract(self, url):
3932 mobj = re.match(self._VALID_URL, url)
3934 raise ExtractorError(u'Invalid URL: %s' % url)
3935 playlist_id = mobj.group('id')
3937 webpage = self._download_webpage(url, playlist_id)
# Mix metadata is embedded as JSON assigned to PAGE.mix in the page's JS.
3939 m = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
3941 raise ExtractorError(u'Cannot find trax information')
3942 json_like = m.group(1)
3943 data = json.loads(json_like)
# Random session token required by the 8tracks play API.
3945 session = str(random.randint(0, 1000000000))
3947 track_count = data['tracks_count']
# NOTE(review): `mix_id` is used here but its assignment is elided from this
# listing — presumably taken from `data`; confirm against the full file.
3948 first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
3949 next_url = first_url
# The API only serves one track at a time; loop until at_last_track is set.
3951 for i in itertools.count():
3952 api_json = self._download_webpage(next_url, playlist_id,
3953 note=u'Downloading song information %s/%s' % (str(i+1), track_count),
3954 errnote=u'Failed to download song information')
3955 api_data = json.loads(api_json)
3956 track_data = api_data[u'set']['track']
3958 'id': track_data['id'],
3959 'url': track_data['track_file_stream_url'],
3960 'title': track_data['performer'] + u' - ' + track_data['name'],
3961 'raw_title': track_data['name'],
3962 'uploader_id': data['user']['login'],
3966 if api_data['set']['at_last_track']:
3968 next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
# NOTE(review): whitespace-mangled listing with elided lines (the stray leading
# integers are paste artifacts); code left byte-identical, comments only.
3971 class KeekIE(InfoExtractor):
3972 _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
3975 def _real_extract(self, url):
3976 m = re.match(self._VALID_URL, url)
3977 video_id = m.group('videoID')
# Both video and thumbnail URLs are derived directly from the id on the CDN.
3978 video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
3979 thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
3980 webpage = self._download_webpage(url, video_id)
3981 m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
3982 title = unescapeHTML(m.group('title'))
3983 m = re.search(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>', webpage)
3984 uploader = clean_html(m.group('uploader'))
3990 'thumbnail': thumbnail,
3991 'uploader': uploader
# NOTE(review): whitespace-mangled listing with elided lines (the stray leading
# integers are paste artifacts); code left byte-identical, comments only.
3995 class TEDIE(InfoExtractor):
3996 _VALID_URL=r'''http://www.ted.com/
3998 ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
4000 ((?P<type_talk>talks)) # We have a simple talk
4002 /(?P<name>\w+) # Here goes the name and then ".html"
# Overridden because _VALID_URL is a verbose (re.VERBOSE) pattern.
4006 def suitable(cls, url):
4007     """Receives a URL and returns True if suitable for this IE."""
4008 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
# Dispatch: single talk vs playlist, decided by which named group matched.
4010 def _real_extract(self, url):
4011 m=re.match(self._VALID_URL, url, re.VERBOSE)
4012 if m.group('type_talk'):
4013 return [self._talk_info(url)]
4015 playlist_id=m.group('playlist_id')
4016 name=m.group('name')
4017 self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
4018 return [self._playlist_videos_info(url,name,playlist_id)]
4020 def _talk_video_link(self,mediaSlug):
4021     '''Returns the video link for that mediaSlug'''
4022 return 'http://download.ted.com/talks/%s.mp4' % mediaSlug
4024 def _playlist_videos_info(self,url,name,playlist_id=0):
4025     '''Returns the videos of the playlist'''
# Multi-line verbose regex capturing each talk's id and media slug.
4027 <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
4028 ([.\s]*?)data-playlist_item_id="(\d+)"
4029 ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
4031 video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
4032 webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
4033 m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
4034 m_names=re.finditer(video_name_RE,webpage)
4036 playlist_RE = r'div class="headline">(\s*?)<h1>(\s*?)<span>(?P<playlist_title>.*?)</span>'
4037 m_playlist = re.search(playlist_RE, webpage)
4038 playlist_title = m_playlist.group('playlist_title')
# Each entry is deferred to TEDIE itself via url_result (re-extracted later).
4040 playlist_entries = []
4041 for m_video, m_name in zip(m_videos,m_names):
4042 video_id=m_video.group('video_id')
4043 talk_url='http://www.ted.com%s' % m_name.group('talk_url')
4044 playlist_entries.append(self.url_result(talk_url, 'TED'))
4045 return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)
4047 def _talk_info(self, url, video_id=0):
4048     """Return the video for the talk in the url"""
4049 m=re.match(self._VALID_URL, url,re.VERBOSE)
4050 videoName=m.group('name')
4051 webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
4052 # If the url includes the language we get the title translated
4053 title_RE=r'<span id="altHeadline" >(?P<title>.*)</span>'
4054 title=re.search(title_RE, webpage).group('title')
# talkDetails JS blob carries the numeric video id and the media slug.
4055 info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
4056 "id":(?P<videoID>[\d]+).*?
4057 "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
4058 thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
4059 thumb_match=re.search(thumb_RE,webpage)
4060 info_match=re.search(info_RE,webpage,re.VERBOSE)
4061 video_id=info_match.group('videoID')
4062 mediaSlug=info_match.group('mediaSlug')
4063 video_url=self._talk_video_link(mediaSlug)
4069 'thumbnail': thumb_match.group('thumbnail')
# NOTE(review): whitespace-mangled listing with elided lines (the stray leading
# integers are paste artifacts); code left byte-identical, comments only.
4073 class MySpassIE(InfoExtractor):
4074 _VALID_URL = r'http://www.myspass.de/.*'
4076 def _real_extract(self, url):
4077 META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'
4079 # video id is the last path element of the URL
4080 # usually there is a trailing slash, so also try the second but last
4081 url_path = compat_urllib_parse_urlparse(url).path
4082 url_parent_path, video_id = os.path.split(url_path)
# Fallback: with a trailing slash, os.path.split yields an empty last element.
4084 _, video_id = os.path.split(url_parent_path)
# All metadata comes from a dedicated XML endpoint keyed by the video id.
4087 metadata_url = META_DATA_URL_TEMPLATE % video_id
4088 metadata_text = self._download_webpage(metadata_url, video_id)
4089 metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))
4091 # extract values from metadata
4092 url_flv_el = metadata.find('url_flv')
4093 if url_flv_el is None:
4094 self._downloader.report_error(u'unable to extract download url')
4096 video_url = url_flv_el.text
4097 extension = os.path.splitext(video_url)[1][1:]
4098 title_el = metadata.find('title')
4099 if title_el is None:
4100 self._downloader.report_error(u'unable to extract title')
4102 title = title_el.text
# format_id, description and imagePreview are optional elements in the XML.
4103 format_id_el = metadata.find('format_id')
4104 if format_id_el is None:
4107 format = format_id_el.text
4108 description_el = metadata.find('description')
4109 if description_el is not None:
4110 description = description_el.text
4113 imagePreview_el = metadata.find('imagePreview')
4114 if imagePreview_el is not None:
4115 thumbnail = imagePreview_el.text
4124 'thumbnail': thumbnail,
4125 'description': description
class SpiegelIE(InfoExtractor):
    """Information extractor for spiegel.de videos.

    Scrapes the title from the HTML page, then reads filename and
    duration from a per-video XML document on video2.spiegel.de.
    """
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        # The numeric id is captured by the URL pattern itself.
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')

        webpage = self._download_webpage(url, video_id)
        m = re.search(r'<div class="spVideoTitle">(.*?)</div>', webpage)
        # NOTE(review): a guard line (likely `if m is None:`) appears to be
        # elided from this excerpt; the raise is presumably conditional.
        raise ExtractorError(u'Cannot find title')
        video_title = unescapeHTML(m.group(1))

        # Per-video XML document listing the available files.
        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(xml_url, video_id,
                    note=u'Downloading XML', errnote=u'Failed to download XML')

        idoc = xml.etree.ElementTree.fromstring(xml_code)
        # Uses the document's last child — presumably the preferred
        # format entry; verify against the actual XML schema.
        last_type = idoc[-1]
        filename = last_type.findall('./filename')[0].text
        duration = float(last_type.findall('./duration')[0].text)

        video_url = 'http://video2.spiegel.de/flash/' + filename
        video_ext = filename.rpartition('.')[2]
        # NOTE(review): the surrounding return/info-dict lines are elided
        # from this excerpt; these entries belong to the returned info dict.
        'title': video_title,
        'duration': duration,
class LiveLeakIE(InfoExtractor):
    """Information extractor for liveleak.com view pages.

    Pulls the media URL from an embedded `file: "..."` player setting and
    title/description/uploader from og: meta tags and page markup.
    """

    _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): guard lines (e.g. `if mobj is None:` / `if m is
        # None:`) appear elided from this excerpt before each
        # report_error call below; they are presumably conditional.
        self._downloader.report_error(u'invalid URL: %s' % url)

        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        m = re.search(r'file: "(.*?)",', webpage)
        self._downloader.report_error(u'unable to find video url')
        video_url = m.group(1)

        m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
        self._downloader.report_error(u'Cannot find video title')
        # Strip the site's "LiveLeak.com -" prefix from the og:title value.
        title = unescapeHTML(m.group('title')).replace('LiveLeak.com -', '').strip()

        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        desc = unescapeHTML(m.group('desc'))

        m = re.search(r'By:.*?(\w+)</a>', webpage)
        uploader = clean_html(m.group(1))
        # NOTE(review): the surrounding return/info-dict lines are elided
        # from this excerpt; these entries belong to the returned info dict.
        'description': desc,
        'uploader': uploader
class ARDIE(InfoExtractor):
    """Information extractor for the ARD / DasErste Mediathek.

    Parses `mediaCollection.addMediaStream(...)` calls out of the page to
    collect available streams, then picks the highest-quality one of
    media type 0; the result is either an RTMP stream or an HTTP .mp4.
    """
    _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
    # Page headline; used as the video title.
    _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
    # One addMediaStream(...) call: media type, quality, rtmp url, video url.
    _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'

    def _real_extract(self, url):
        # determine video id from url
        m = re.match(self._VALID_URL, url)
        # NOTE(review): branch lines appear elided from this excerpt; the
        # two video_id assignments below are presumably alternatives
        # depending on whether a documentId= query parameter is present.
        numid = re.search(r'documentId=([0-9]+)', url)
        video_id = numid.group(1)
        video_id = m.group('video_id')

        # determine title and media streams from webpage
        html = self._download_webpage(url, video_id)
        title = re.search(self._TITLE, html).group('title')
        streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
        # NOTE(review): a guard (likely `if not streams:`) appears elided
        # here; age-restricted ("fsk") pages expose no streams.
        assert '"fsk"' in html
        self._downloader.report_error(u'this video is only available after 8:00 pm')

        # choose default media type and highest quality for now
        stream = max([s for s in streams if int(s["media_type"]) == 0],
                     key=lambda s: int(s["quality"]))

        # there's two possibilities: RTMP stream or HTTP download
        info = {'id': video_id, 'title': title, 'ext': 'mp4'}
        if stream['rtmp_url']:
            self.to_screen(u'RTMP download detected')
            assert stream['video_url'].startswith('mp4:')
            info["url"] = stream["rtmp_url"]
            info["play_path"] = stream['video_url']
        # NOTE(review): an `else:` line appears elided here; the two lines
        # below presumably form the plain-HTTP branch.
        assert stream["video_url"].endswith('.mp4')
        info["url"] = stream["video_url"]
class TumblrIE(InfoExtractor):
    """Information extractor for Tumblr-hosted videos.

    Normalizes the URL to the canonical post URL, then scrapes the
    JS-escaped player markup for the video file URL, extension,
    thumbnail and title.
    """
    _VALID_URL = r'http://(?P<blog_name>.*?).tumblr.com/((post)|(video))/(?P<id>\d*)/(.*?)'

    def _real_extract(self, url):
        m_url = re.match(self._VALID_URL, url)
        video_id = m_url.group('id')
        blog = m_url.group('blog_name')

        # Canonical post URL; works for both /post/ and /video/ links.
        url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
        webpage = self._download_webpage(url, video_id)

        # The player markup is JS-escaped, hence the \x22 (= '"') markers.
        re_video = r'src=\\x22(?P<video_url>http://%s.tumblr.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
        video = re.search(re_video, webpage)
        # NOTE(review): a guard (likely `if video is None:`) appears elided
        # from this excerpt before this message.
        self.to_screen("No video founded")
        video_url = video.group('video_url')
        ext = video.group('ext')

        re_thumb = r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22' # We pick the first poster
        thumb = re.search(re_thumb, webpage).group('thumb').replace('\\', '')

        # The only place where you can get a title, it's not complete,
        # but searching in other places doesn't work for all videos
        re_title = r'<title>(.*?) - (?P<title>.*?)</title>'
        title = unescapeHTML(re.search(re_title, webpage).group('title'))

        # NOTE(review): the remaining info-dict entries are elided from
        # this excerpt.
        return [{'id': video_id,
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
        YoutubePlaylistIE(),
        StanfordOpenClassroomIE(),
        WorldStarHipHopIE(),
    # NOTE(review): most of the extractor list, the docstring close and
    # the `return [` opener are elided from this excerpt; the three
    # entries above are part of the returned ordered list.
def get_info_extractor(ie_name):
    """Look up an info extractor class by its bare name.

    The conventional 'IE' suffix is appended and the resulting identifier
    is resolved in this module's namespace; a KeyError propagates when no
    such extractor class exists.
    """
    class_name = '%sIE' % ie_name
    return globals()[class_name]