2 # -*- coding: utf-8 -*-
4 from __future__ import absolute_import
15 import xml.etree.ElementTree
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    title:          Video title, unescaped.
    ext:            Video filename extension.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The subtitle file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    The fields should all be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """
def __init__(self, downloader=None):
    """Constructor. Receives an optional downloader."""
    # NOTE(review): original line 74 is missing from this excerpt
    # (presumably an internal readiness-flag reset) — confirm upstream.
    self.set_downloader(downloader)
def suitable(cls, url):
    """Return True when *url* matches this extractor's _VALID_URL."""
    # NOTE(review): the @classmethod decorator line (original line 77)
    # is missing from this excerpt.
    matched = re.match(cls._VALID_URL, url)
    return matched is not None
"""Getter method for _WORKING."""
# NOTE(review): the enclosing `def working(self):` and its return
# statement (original lines 83/85) are missing from this excerpt.
"""Initializes an instance (authentication, etc)."""
# NOTE(review): the enclosing `def initialize(self):` and its
# run-once/readiness guard (original lines 87/89) are missing from
# this excerpt; only the delegation below is visible.
self._real_initialize()
def extract(self, url):
    """Extracts URL information and returns it in list of dicts."""
    # NOTE(review): original line 95 is missing from this excerpt —
    # presumably a call to self.initialize() before extraction.
    return self._real_extract(url)
def set_downloader(self, downloader):
    """Attach *downloader* (a FileDownloader, or None) to this extractor."""
    self._downloader = downloader
def _real_initialize(self):
    """Real initialization process. Redefine in subclasses."""
    # NOTE(review): the stub body (original line 104, presumably `pass`)
    # is missing from this excerpt.

def _real_extract(self, url):
    """Real extraction process. Redefine in subclasses."""
    # NOTE(review): the stub body (original line 108, presumably `pass`)
    # is missing from this excerpt.

# NOTE(review): the property header (`@property` / `def IE_NAME(self):`,
# original lines 109-111) is missing from this excerpt. The visible line
# derives the IE name from the class name minus the trailing "IE".
return type(self).__name__[:-2]
def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
    """ Returns the response handle """
    # NOTE(review): the `if note is None:` guard (original line 116) is
    # missing from this excerpt; the default note is the standard
    # "Downloading webpage" report below.
    self.report_download_webpage(video_id)
    elif note is not False:
        # Caller-supplied progress note, prefixed with the video id.
        self.to_screen(u'%s: %s' % (video_id, note))
    # NOTE(review): the `try:` opener (original line 120) is missing.
    return compat_urllib_request.urlopen(url_or_request)
    except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        # NOTE(review): the `if errnote is None:` guard (original line
        # 123) is missing.
        errnote = u'Unable to download webpage'
        # Wrap network failures in ExtractorError, preserving the traceback.
        raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])
def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
    """ Returns the data of the page as a string """
    urlh = self._request_webpage(url_or_request, video_id, note, errnote)
    content_type = urlh.headers.get('Content-Type', '')
    # Pull the declared charset out of e.g. "text/html; charset=utf-8".
    m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
    # NOTE(review): the `if m:` guard and the else-branch default
    # encoding (original lines 132, 134-135) are missing from this
    # excerpt; as shown, m.group(1) would raise if no charset matched.
    encoding = m.group(1)
    webpage_bytes = urlh.read()
    if self._downloader.params.get('dump_intermediate_pages', False):
        # NOTE(review): the `try:` opener (original line 138) is missing.
        url = url_or_request.get_full_url()
        except AttributeError:
            # NOTE(review): the fallback assignment (original line 141,
            # presumably `url = url_or_request`) is missing.
        self.to_screen(u'Dumping request to ' + url)
        # base64 so arbitrary page bytes survive the text console.
        dump = base64.b64encode(webpage_bytes).decode('ascii')
        self._downloader.to_screen(dump)
    return webpage_bytes.decode(encoding, 'replace')
def to_screen(self, msg):
    """Print msg to screen, prefixing it with '[ie_name]'"""
    prefixed = u'[%s] %s' % (self.IE_NAME, msg)
    self._downloader.to_screen(prefixed)
def report_extraction(self, id_or_name):
    """Announce that information extraction has started."""
    message = u'%s: Extracting information' % id_or_name
    self.to_screen(message)

def report_download_webpage(self, video_id):
    """Announce the webpage download step."""
    message = u'%s: Downloading webpage' % video_id
    self.to_screen(message)

def report_age_confirmation(self):
    """Announce the age-confirmation step."""
    self.to_screen(u'Confirming age')
# Methods for following #608
# They set the correct value of the '_type' key
def video_result(self, video_info):
    """Returns a video"""
    video_info['_type'] = 'video'
    # NOTE(review): the `return video_info` line (original line 168) is
    # missing from this excerpt.

def url_result(self, url, ie=None):
    """Returns a url that points to a page that should be processed"""
    #TODO: ie should be the class used for getting the info
    video_info = {'_type': 'url',
    # NOTE(review): the remaining dict entries and the return statement
    # (original lines 173-175) are missing from this excerpt.

def playlist_result(self, entries, playlist_id=None, playlist_title=None):
    """Returns a playlist"""
    video_info = {'_type': 'playlist',
    # NOTE(review): the entries item and the closing brace (original
    # lines 179-180) are missing, as are the `if playlist_id is not
    # None:` / `if playlist_title is not None:` guards around the two
    # assignments below (original lines 182/184-ish) — confirm upstream.
    video_info['id'] = playlist_id
    video_info['title'] = playlist_title
    # NOTE(review): the `return video_info` (original lines 185-186) is
    # missing from this excerpt.
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # NOTE(review): the `_VALID_URL = r"""(?x)...` opener (original
    # lines 189-192) and the closing quotes (original line 208) are
    # missing from this excerpt; the lines below are the body of that
    # verbose URL-matching pattern.
    (?:https?://)? # http(s):// (optional)
    (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
    tube\.majestyc\.net/) # the various hostnames, with wildcard subdomains
    (?:.*?\#/)? # handle anchor (#/) redirect urls
    (?: # the various things that can precede the ID:
    (?:(?:v|embed|e)/) # v/ or embed/ or e/
    |(?: # or the v= param in all its forms
    (?:watch(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
    (?:\?|\#!?) # the params delimiter ? or # or #!
    (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
    # NOTE: the v= alternative itself, original lines 202-203, is missing here
    )? # optional -> youtube.com/xxxx is OK
    )? # all until now is optional -> you can pass the naked ID
    ([0-9A-Za-z_-]+) # here is it! the YouTube video ID
    (?(1).+)? # if we found the ID, everything can follow
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    _video_extensions = {
        # NOTE(review): the earlier itag->extension entries (original
        # lines 218-222) are missing from this excerpt.
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        # NOTE(review): the remaining entries and the closing brace
        # (original lines 224-228) are missing.
    _video_dimensions = {
        # NOTE(review): all itag->resolution entries and the closing
        # brace (original lines 230-246) are missing from this excerpt.
def suitable(cls, url):
    """Receives a URL and returns True if suitable for this IE."""
    # NOTE(review): the @classmethod decorator line (original line 247)
    # is missing from this excerpt.
    # Playlist URLs are handled by YoutubePlaylistIE, so refuse them here.
    if YoutubePlaylistIE.suitable(url):
        return False
    matched = re.match(cls._VALID_URL, url, re.VERBOSE)
    return matched is not None
253 def report_lang(self):
254 """Report attempt to set language."""
255 self.to_screen(u'Setting language')
257 def report_login(self):
258 """Report attempt to log in."""
259 self.to_screen(u'Logging in')
261 def report_video_webpage_download(self, video_id):
262 """Report attempt to download video webpage."""
263 self.to_screen(u'%s: Downloading video webpage' % video_id)
265 def report_video_info_webpage_download(self, video_id):
266 """Report attempt to download video info webpage."""
267 self.to_screen(u'%s: Downloading video info webpage' % video_id)
269 def report_video_subtitles_download(self, video_id):
270 """Report attempt to download video info webpage."""
271 self.to_screen(u'%s: Checking available subtitles' % video_id)
273 def report_video_subtitles_request(self, video_id, sub_lang, format):
274 """Report attempt to download video info webpage."""
275 self.to_screen(u'%s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))
277 def report_video_subtitles_available(self, video_id, sub_lang_list):
278 """Report available subtitles."""
279 sub_lang = ",".join(list(sub_lang_list.keys()))
280 self.to_screen(u'%s: Available subtitles for video: %s' % (video_id, sub_lang))
282 def report_information_extraction(self, video_id):
283 """Report attempt to extract video information."""
284 self.to_screen(u'%s: Extracting video information' % video_id)
286 def report_unavailable_format(self, video_id, format):
287 """Report extracted video URL."""
288 self.to_screen(u'%s: Format %s not available' % (video_id, format))
290 def report_rtmp_download(self):
291 """Indicate the download will use the RTMP protocol."""
292 self.to_screen(u'RTMP download detected')
def _get_available_subtitles(self, video_id):
    """Fetch the caption-track list for video_id.

    On success returns a dict mapping language code -> track name; on
    failure returns an (error_message, None) tuple.
    """
    self.report_video_subtitles_download(video_id)
    request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
    # NOTE(review): the `try:` opener (original line 297) is missing
    # from this excerpt.
    sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
    except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        return (u'unable to download video subtitles: %s' % compat_str(err), None)
    # findall yields (name, lang_code) pairs; key the dict by lang_code.
    sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
    sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
    if not sub_lang_list:
        return (u'video doesn\'t have subtitles', None)
    # NOTE(review): the success-path `return sub_lang_list` (original
    # lines 305-306) is missing from this excerpt.
307 def _list_available_subtitles(self, video_id):
308 sub_lang_list = self._get_available_subtitles(video_id)
309 self.report_video_subtitles_available(video_id, sub_lang_list)
def _request_subtitle(self, sub_lang, sub_name, video_id, format):
    """Fetch a single subtitle track and return a tuple:
    (error_message, sub_lang, sub)
    where error_message is None on success.
    """
    self.report_video_subtitles_request(video_id, sub_lang, format)
    params = compat_urllib_parse.urlencode({
        # NOTE(review): the query parameter entries and closing brace
        # (original lines 318-322) are missing from this excerpt.
    url = 'http://www.youtube.com/api/timedtext?' + params
    # NOTE(review): the `try:` opener (original line 324) is missing.
    sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
    except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
    # NOTE(review): the `if not sub:` guard (original line 328) before
    # the empty-response error return below is missing.
    return (u'Did not fetch video subtitles', None, None)
    return (None, sub_lang, sub)
def _extract_subtitle(self, video_id):
    """
    Return a list with a tuple:
    [(error_message, sub_lang, sub)]
    """
    sub_lang_list = self._get_available_subtitles(video_id)
    sub_format = self._downloader.params.get('subtitlesformat')
    if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
        return [(sub_lang_list[0], None, None)]
    # Language selection: explicit --sub-lang, then English, then the
    # first available language.
    if self._downloader.params.get('subtitleslang', False):
        sub_lang = self._downloader.params.get('subtitleslang')
    elif 'en' in sub_lang_list:
        # NOTE(review): the `sub_lang = 'en'` assignment and the `else:`
        # header (original lines 344-345) are missing from this excerpt.
        sub_lang = list(sub_lang_list.keys())[0]
    if not sub_lang in sub_lang_list:
        # NOTE(review): style nit — `sub_lang not in sub_lang_list` is
        # the idiomatic spelling (left unchanged here).
        return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]

    subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
    # NOTE(review): the `return [subtitle]` (original lines 351-352) is
    # missing from this excerpt.
def _extract_all_subtitles(self, video_id):
    """Download every available subtitle track for video_id; returns a
    list of (error_message, sub_lang, sub) tuples."""
    sub_lang_list = self._get_available_subtitles(video_id)
    sub_format = self._downloader.params.get('subtitlesformat')
    if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
        return [(sub_lang_list[0], None, None)]
    # NOTE(review): the `subtitles = []` initialiser (original line 358)
    # is missing from this excerpt.
    for sub_lang in sub_lang_list:
        subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
        subtitles.append(subtitle)
    # NOTE(review): the `return subtitles` (original lines 362-363) is
    # missing from this excerpt.
def _print_formats(self, formats):
    """Print an itag / extension / resolution table for *formats*."""
    print('Available formats:')
    # NOTE(review): the `for x in formats:` loop header (original line
    # 366) is missing from this excerpt.
    print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
def _real_initialize(self):
    """Set language, then optionally log in and confirm age.

    NOTE(review): this excerpt is a sampled view of the method — many
    guard lines, `try:` openers and `return`s are missing; each gap is
    flagged inline below.
    """
    if self._downloader is None:
        # NOTE(review): the early return and the username/password
        # initialisation (original lines 371-374) are missing.
    downloader_params = self._downloader.params

    # Attempt to use provided username and password or .netrc data
    if downloader_params.get('username', None) is not None:
        username = downloader_params['username']
        password = downloader_params['password']
    elif downloader_params.get('usenetrc', False):
        # NOTE(review): the `try:` opener (original line 382) is missing.
        info = netrc.netrc().authenticators(self._NETRC_MACHINE)
        # NOTE(review): the unpacking of info into username/password and
        # its None check (original lines 384-387) are missing.
        raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
        except (IOError, netrc.NetrcParseError) as err:
            # .netrc problems are non-fatal: warn and continue anonymous.
            self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
            # NOTE(review): original lines 391-393 (return / section
            # comment) are missing.

    request = compat_urllib_request.Request(self._LANG_URL)
    # NOTE(review): the `try:` opener and self.report_lang() call
    # (original lines 395-396) are missing.
    compat_urllib_request.urlopen(request).read()
    except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))

    # No authentication to be performed
    # NOTE(review): the `if username is None: return` guard (original
    # lines 400-405) is missing around here.
    request = compat_urllib_request.Request(self._LOGIN_URL)
    # NOTE(review): the `try:` opener (original line 407) is missing.
    login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
    except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
        # NOTE(review): the `return` (original lines 411-414) is missing.

    # Scrape the GALX hidden form token required by Google ServiceLogin.
    match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
    # NOTE(review): the `if match:` guard (original line 416) is missing.
    galx = match.group(1)

    # Scrape the dsh hidden form token; its assignment and the start of
    # the login_form_strs dict (original lines 420-424) are missing.
    match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
    u'continue': u'http://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
    # NOTE(review): several form entries (original lines 426-428) missing.
    u'PersistentCookie': u'yes',
    # NOTE(review): original line 430 missing.
    u'bgresponse': u'js_disabled',
    u'checkConnection': u'',
    u'checkedDomains': u'youtube',
    # NOTE(review): form entries on original lines 434-438 are missing.
    u'signIn': u'Sign in',
    # NOTE(review): original line 440 missing.
    u'service': u'youtube',
    # NOTE(review): remaining entries and closing brace (original lines
    # 442-444) are missing.
    # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
    # (original line 446, the rest of this comment, is missing)
    login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
    login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
    request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
    # NOTE(review): the `try:` opener and self.report_login() call
    # (original lines 450-451) are missing.
    login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
    # The login form re-appearing in the response means auth failed.
    if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
        self._downloader.report_warning(u'unable to log in: bad username or password')
        # NOTE(review): the `return` (original line 455) is missing.
    except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
        # NOTE(review): return plus the age_form dict opening (original
        # lines 458-462) are missing.
    'action_confirm': 'Confirm',
    # NOTE(review): the dict closing brace (original line 464) is missing.
    request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
    # NOTE(review): the `try:` opener (original line 466) is missing.
    self.report_age_confirmation()
    age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
    except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
        # NOTE(review): the `return` (original lines 471-472) is missing.
def _extract_id(self, url):
    """Extract the video id (group 2 of _VALID_URL) from *url*."""
    mobj = re.match(self._VALID_URL, url, re.VERBOSE)
    # NOTE(review): the `if mobj is None:` guard (original line 475) and
    # its `return` (original line 477) are missing from this excerpt.
    self._downloader.report_error(u'invalid URL: %s' % url)
    video_id = mobj.group(2)
    # NOTE(review): the `return video_id` (original lines 479-480) is
    # missing from this excerpt.
def _real_extract(self, url):
    # NOTE(review): this excerpt is a sampled view of the method — many
    # guard lines, `try:` openers, `else:` headers and `return`s are
    # missing; each gap is flagged inline below.
    # Extract original video URL from URL with redirection, like age verification, using next_url parameter
    mobj = re.search(self._NEXT_URL_RE, url)
    # NOTE(review): the `if mobj:` guard (original line 484) is missing.
    url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
    video_id = self._extract_id(url)

    # Get video webpage
    self.report_video_webpage_download(video_id)
    url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
    request = compat_urllib_request.Request(url)
    # NOTE(review): the `try:` opener (original line 492) is missing.
    video_webpage_bytes = compat_urllib_request.urlopen(request).read()
    except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
        # NOTE(review): `return` (original lines 496-497) missing.

    video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

    # Attempt to extract SWF player URL
    mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
    # NOTE(review): the `if mobj is not None:` guard (original line 502)
    # and the `else: player_url = None` branch (504-506) are missing.
    # Unescape the JS-escaped URL (\\/ -> /).
    player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))

    # Get video info: try several `el` variants until one yields a token.
    self.report_video_info_webpage_download(video_id)
    for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
        video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                % (video_id, el_type))
        video_info_webpage = self._download_webpage(video_info_url, video_id,
                # NOTE(review): the `note=False,` argument (original
                # line 513) is missing.
                errnote='unable to download video info webpage')
        video_info = compat_parse_qs(video_info_webpage)
        if 'token' in video_info:
            # NOTE(review): `break` (original line 517) missing.
    if 'token' not in video_info:
        if 'reason' in video_info:
            self._downloader.report_error(u'YouTube said: %s' % video_info['reason'][0])
        # NOTE(review): the `else:` header (original line 521) is missing:
            self._downloader.report_error(u'"token" parameter not in video info for unknown reason')
        # NOTE(review): `return` (original lines 523-524) missing.

    # Check for "rental" videos
    if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
        self._downloader.report_error(u'"rental" videos not supported')
        # NOTE(review): `return` (original lines 528-529) missing.

    # Start extracting information
    self.report_information_extraction(video_id)

    # uploader name
    if 'author' not in video_info:
        self._downloader.report_error(u'unable to extract uploader name')
        # NOTE(review): `return` (original line 536) missing.
    video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

    # uploader id / nickname
    video_uploader_id = None
    mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
    # NOTE(review): the `if mobj is not None:` guard (original line 542)
    # and the `else:` header (line 544) are missing.
    video_uploader_id = mobj.group(1)
    self._downloader.report_warning(u'unable to extract uploader nickname')

    # title
    if 'title' not in video_info:
        self._downloader.report_error(u'unable to extract video title')
        # NOTE(review): `return` (original line 550) missing.
    video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

    # thumbnail image
    if 'thumbnail_url' not in video_info:
        self._downloader.report_warning(u'unable to extract video thumbnail')
        # NOTE(review): the fallback assignment (original line 556) is
        # missing.
    else: # don't panic if we can't find it
        video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

    # upload date
    # NOTE(review): the `upload_date = None` initialiser (original lines
    # 560-561) and the `if mobj is not None:` guard (line 563) are missing.
    mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
    upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
    format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
    for expression in format_expressions:
        # NOTE(review): the `try:` opener (original line 567) and its
        # `except ValueError: pass` (lines 569-572) are missing.
        upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')

    # description: prefer the page element, fall back to the meta tag.
    video_description = get_element_by_id("eow-description", video_webpage)
    if video_description:
        video_description = clean_html(video_description)
    # NOTE(review): the `else:` header (original line 576) is missing:
        fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
        # NOTE(review): `if fd_mobj:` / `else:` headers (original lines
        # 578/580) are missing around the two assignments below.
        video_description = unescapeHTML(fd_mobj.group(1))
        video_description = u''

    # subtitles
    video_subtitles = None

    if self._downloader.params.get('writesubtitles', False):
        video_subtitles = self._extract_subtitle(video_id)
        # NOTE(review): the `if video_subtitles:` guard (original line
        # 588) and the `if sub_error:` guard (line 590) are missing.
        (sub_error, sub_lang, sub) = video_subtitles[0]
        self._downloader.report_error(sub_error)

    if self._downloader.params.get('allsubtitles', False):
        video_subtitles = self._extract_all_subtitles(video_id)
        for video_subtitle in video_subtitles:
            (sub_error, sub_lang, sub) = video_subtitle
            # NOTE(review): the `if sub_error:` guard (original line
            # 597) is missing.
            self._downloader.report_error(sub_error)

    if self._downloader.params.get('listsubtitles', False):
        sub_lang_list = self._list_available_subtitles(video_id)
        # NOTE(review): `return` (original lines 602-603) missing.

    if 'length_seconds' not in video_info:
        self._downloader.report_warning(u'unable to extract video duration')
        # NOTE(review): the fallback assignment and the `else:` header
        # (original lines 606-607) are missing.
    video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

    # token
    video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

    # Decide which formats to download
    req_format = self._downloader.params.get('format', None)

    if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
        self.report_rtmp_download()
        video_url_list = [(None, video_info['conn'][0])]
    elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
        url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
        url_data = [compat_parse_qs(uds) for uds in url_data_strs]
        url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
        # NOTE(review): 'sig' is accessed here without being required by
        # the filter above — a stream entry lacking 'sig' raises KeyError.
        url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

        format_limit = self._downloader.params.get('format_limit', None)
        available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
        if format_limit is not None and format_limit in available_formats:
            format_list = available_formats[available_formats.index(format_limit):]
        # NOTE(review): the `else:` header (original line 629) is missing:
            format_list = available_formats
        existing_formats = [x for x in format_list if x in url_map]
        if len(existing_formats) == 0:
            raise ExtractorError(u'no known formats available for video')
        if self._downloader.params.get('listformats', None):
            self._print_formats(existing_formats)
            # NOTE(review): `return` (original line 636) missing.
        if req_format is None or req_format == 'best':
            video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
        elif req_format == 'worst':
            video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
        elif req_format in ('-1', 'all'):
            video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
        # NOTE(review): the `else:` header (original line 643) is missing:
            # Specific formats. We pick the first in a slash-delimeted sequence.
            # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
            req_formats = req_format.split('/')
            video_url_list = None
            for rf in req_formats:
                # NOTE(review): the `if rf in url_map:` guard (original
                # line 649) and the `break` (line 651) are missing.
                video_url_list = [(rf, url_map[rf])]
            if video_url_list is None:
                raise ExtractorError(u'requested format not available')
    # NOTE(review): the `else:` header (original line 654) is missing:
        raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')

    # NOTE(review): the `results = []` initialiser (original lines
    # 656-657) is missing.
    for format_param, video_real_url in video_url_list:
        # Extension
        video_extension = self._video_extensions.get(format_param, 'flv')

        video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                self._video_dimensions.get(format_param, '???'))

        # NOTE(review): the `results.append({` and the 'id' entry
        # (original lines 664-666) are missing.
            'url': video_real_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date': upload_date,
            'title': video_title,
            'ext': video_extension,
            'format': video_format,
            'thumbnail': video_thumbnail,
            'description': video_description,
            'player_url': player_url,
            'subtitles': video_subtitles,
            'duration': video_duration
        # NOTE(review): the closing `})` and `return results` (original
        # lines 679-681) are missing from this excerpt.
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    # URL pattern: group 1 is the numeric/slug video id, group 2 the title slug.
    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    # Family-filter disclaimer page fetched before extraction.
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    # Endpoint that the age-confirmation form is POSTed to.
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self.to_screen(u'Retrieving disclaimer')
def _real_initialize(self):
    """Fetch the disclaimer page and POST the age-confirmation form."""
    # Retrieve disclaimer
    request = compat_urllib_request.Request(self._DISCLAIMER)
    # NOTE(review): the `try:` opener (original line 701) is missing
    # from this excerpt.
    self.report_disclaimer()
    disclaimer = compat_urllib_request.urlopen(request).read()
    except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        self._downloader.report_error(u'unable to retrieve disclaimer: %s' % compat_str(err))
        # NOTE(review): the `return` and the disclaimer_form dict opening
        # (original lines 706-710) are missing.
    'submit': "Continue - I'm over 18",
    # NOTE(review): the dict closing brace (original line 712) is missing.
    request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
    # NOTE(review): the `try:` opener (original line 714) is missing.
    self.report_age_confirmation()
    disclaimer = compat_urllib_request.urlopen(request).read()
    except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
        # NOTE(review): the `return` (original lines 719-720) is missing.
def _real_extract(self, url):
    # NOTE(review): this excerpt is a sampled view — several `if mobj is
    # None:` guards, `else:` headers and `return`s are missing; each gap
    # is flagged inline below.
    # Extract id and simplified title from URL
    mobj = re.match(self._VALID_URL, url)
    # NOTE(review): guard (original line 724) and `return` (726-727) missing.
    self._downloader.report_error(u'invalid URL: %s' % url)

    video_id = mobj.group(1)

    # Check if video comes from YouTube
    mobj2 = re.match(r'^yt-(.*)$', video_id)
    if mobj2 is not None:
        # yt- prefixed ids are hosted on YouTube; delegate to that IE.
        return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1), 'Youtube')]

    # Retrieve video webpage to extract further information
    webpage = self._download_webpage('http://www.metacafe.com/watch/%s/' % video_id, video_id)

    # Extract URL, uploader and title from webpage
    self.report_extraction(video_id)
    mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
    # NOTE(review): the `if mobj is not None:` guard (original line 741)
    # is missing.
    mediaURL = compat_urllib_parse.unquote(mobj.group(1))
    video_extension = mediaURL[-3:]

    # Extract gdaKey if available
    mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
    # NOTE(review): the `if mobj is None: video_url = mediaURL` branch
    # (original lines 747-749) is missing.
    gdaKey = mobj.group(1)
    video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
    # NOTE(review): the `else:` header for the flashvars fallback
    # (original line 752) is missing:
    mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
    # NOTE(review): guard (original line 754) and `return` (756) missing.
    self._downloader.report_error(u'unable to extract media URL')
    vardict = compat_parse_qs(mobj.group(1))
    if 'mediaData' not in vardict:
        self._downloader.report_error(u'unable to extract media URL')
        # NOTE(review): `return` (original line 760) missing.
    mobj = re.search(r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
    # NOTE(review): guard (original line 762) and `return` (764) missing.
    self._downloader.report_error(u'unable to extract media URL')
    # Unescape the JSON-escaped URL (\\/ -> /).
    mediaURL = mobj.group('mediaURL').replace('\\/', '/')
    video_extension = mediaURL[-3:]
    video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key'))

    mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
    # NOTE(review): guard (original line 770) and `return` (772) missing.
    self._downloader.report_error(u'unable to extract title')
    video_title = mobj.group(1).decode('utf-8')

    mobj = re.search(r'submitter=(.*?);', webpage)
    # NOTE(review): guard (original line 776) and `return` (778) missing.
    self._downloader.report_error(u'unable to extract uploader nickname')
    video_uploader = mobj.group(1)

    # NOTE(review): the `.decode('utf-8')` calls below assume Python 2
    # byte strings; under Python 3 `str` has no decode() — confirm.
    # The `return [{` opener (original lines 780-781) is missing.
        'id': video_id.decode('utf-8'),
        'url': video_url.decode('utf-8'),
        'uploader': video_uploader.decode('utf-8'),
        # NOTE(review): the entry on original line 785 is missing.
        'title': video_title,
        'ext': video_extension.decode('utf-8'),
    # NOTE(review): the closing `}]` (original lines 788-789) is missing.
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    # Case-insensitive match on any dailymotion TLD; group 1 is the id+slug.
    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'
    # NOTE(review): original lines 796-797 are missing from this excerpt.

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)
def _real_extract(self, url):
    # NOTE(review): this excerpt is a sampled view — several guards,
    # `else:` headers, `break`s and `return`s are missing; flagged inline.
    # Extract id and simplified title from URL
    mobj = re.match(self._VALID_URL, url)
    # NOTE(review): the `if mobj is None:` guard (original line 804) and
    # its `return` (806-807) are missing.
    self._downloader.report_error(u'invalid URL: %s' % url)

    # Strip slug/query noise: keep only the leading id token.
    video_id = mobj.group(1).split('_')[0].split('?')[0]

    video_extension = 'mp4'

    # Retrieve video webpage to extract further information
    request = compat_urllib_request.Request(url)
    # Disable family filtering so age-restricted pages still resolve.
    request.add_header('Cookie', 'family_filter=off')
    webpage = self._download_webpage(request, video_id)

    # Extract URL, uploader and title from webpage
    self.report_extraction(video_id)
    mobj = re.search(r'\s*var flashvars = (.*)', webpage)
    # NOTE(review): guard (original line 820) and `return` (822) missing.
    self._downloader.report_error(u'unable to extract media URL')
    flashvars = compat_urllib_parse.unquote(mobj.group(1))

    # Probe qualities from best to worst; keep the first key present.
    for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
        # NOTE(review): the `if key in flashvars:` guard and the
        # `max_quality = key` assignment (original lines 826-827) are
        # missing, as are the `break` / for-`else` error path (829-833).
        self.to_screen(u'Using %s' % key)
        self._downloader.report_error(u'unable to extract video URL')

    mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
    # NOTE(review): guard (original line 835) and `return` (837-838) missing.
    self._downloader.report_error(u'unable to extract video URL')

    # Unescape the JSON-escaped URL (\\/ -> /).
    video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')

    # TODO: support choosing qualities

    mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
    # NOTE(review): guard (original line 844) and `return` (846) missing.
    self._downloader.report_error(u'unable to extract title')
    video_title = unescapeHTML(mobj.group('title'))

    video_uploader = None
    mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
    # NOTE(review): the `if mobj is None:` guard (original line 851) is
    # missing around the official-user fallback below.
    # lookin for official user
    mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
    if mobj_official is None:
        self._downloader.report_warning(u'unable to extract uploader nickname')
    # NOTE(review): the `else:` header (original line 856) is missing:
        video_uploader = mobj_official.group(1)
    # NOTE(review): the outer `else:` header (original line 858) is missing:
        video_uploader = mobj.group(1)

    video_upload_date = None
    mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
    # NOTE(review): the `if mobj is not None:` guard (original line 863)
    # is missing. Page shows DD-MM-YYYY; reordered to YYYYMMDD below.
    video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)

    # NOTE(review): the `return [{` opener with the 'id'/'url' entries
    # (original lines 865-868) is missing.
        'uploader': video_uploader,
        'upload_date': video_upload_date,
        'title': video_title,
        'ext': video_extension,
    # NOTE(review): the closing `}]` (original lines 873-874) is missing.
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    # Group 1 captures the .flv filename from the `current=` query param.
    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)
def _real_extract(self, url):
    # NOTE(review): this excerpt is a sampled view — several guards,
    # `try:` openers and `return`s are missing; flagged inline.
    # Extract id from URL
    mobj = re.match(self._VALID_URL, url)
    # NOTE(review): guard (original line 888) and `return` (890-891) missing.
    self._downloader.report_error(u'Invalid URL: %s' % url)

    video_id = mobj.group(1)

    video_extension = 'flv'

    # Retrieve video webpage to extract further information
    request = compat_urllib_request.Request(url)
    # NOTE(review): the `try:` opener (original line 898) is missing.
    self.report_download_webpage(video_id)
    webpage = compat_urllib_request.urlopen(request).read()
    except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
        # NOTE(review): `return` (original lines 903-904) missing.

    # Extract URL, uploader, and title from webpage
    self.report_extraction(video_id)
    mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
    # NOTE(review): guard (original line 908) and `return` (910) missing.
    self._downloader.report_error(u'unable to extract media URL')
    mediaURL = compat_urllib_parse.unquote(mobj.group(1))
    # NOTE(review): the `video_url = mediaURL` assignment (original
    # lines 912-914) is missing.

    mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
    # NOTE(review): guard (original line 916) and `return` (918) missing.
    self._downloader.report_error(u'unable to extract title')
    video_title = mobj.group(1).decode('utf-8')

    video_uploader = mobj.group(2).decode('utf-8')

    # NOTE(review): the `.decode('utf-8')` calls assume Python 2 byte
    # strings. The `return [{` opener (original lines 922-923) is missing.
        'id': video_id.decode('utf-8'),
        'url': video_url.decode('utf-8'),
        'uploader': video_uploader,
        # NOTE(review): the entry on original line 927 is missing.
        'title': video_title,
        'ext': video_extension.decode('utf-8'),
    # NOTE(review): the closing `}]` (original lines 930-931) is missing.
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    # NOTE(review): original line 935 is missing from this excerpt.
    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)
def _real_extract(self, url, new_video=True):
    """Extract info for a Yahoo! Video page.

    Non-/watch/ URLs are first resolved to a canonical English-language
    /watch/ URL and re-extracted once (new_video=False marks the second
    pass).  Returns a one-element list of info dicts, or None after
    reporting an error.
    """
    # Extract ID from URL
    mobj = re.match(self._VALID_URL, url)
    if mobj is None:
        self._downloader.report_error(u'Invalid URL: %s' % url)
        return

    video_id = mobj.group(2)
    video_extension = 'flv'

    # Rewrite valid but non-extractable URLs as
    # extractable English language /watch/ URLs
    if re.match(self._VPAGE_URL, url) is None:
        request = compat_urllib_request.Request(url)
        try:
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
        if mobj is None:
            self._downloader.report_error(u'Unable to extract id field')
            return
        yahoo_id = mobj.group(1)

        mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
        if mobj is None:
            self._downloader.report_error(u'Unable to extract vid field')
            return
        yahoo_vid = mobj.group(1)

        url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
        return self._real_extract(url, new_video=False)

    # Retrieve video webpage to extract further information
    request = compat_urllib_request.Request(url)
    try:
        self.report_download_webpage(video_id)
        webpage = compat_urllib_request.urlopen(request).read()
    except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
        return

    # Extract uploader and title from webpage
    # NOTE(review): the .decode('utf-8') calls below assume Python 2
    # byte strings — confirm against the file's supported interpreters.
    self.report_extraction(video_id)
    mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
    if mobj is None:
        self._downloader.report_error(u'unable to extract video title')
        return
    video_title = mobj.group(1).decode('utf-8')

    mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
    if mobj is None:
        self._downloader.report_error(u'unable to extract video uploader')
        return
    # BUG FIX: group(1) is the 'people'/'profile' path component; the
    # uploader's display name is the anchor text in group(2).
    video_uploader = mobj.group(2).decode('utf-8')

    # Extract video thumbnail
    mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
    if mobj is None:
        self._downloader.report_error(u'unable to extract video thumbnail')
        return
    video_thumbnail = mobj.group(1).decode('utf-8')

    # Extract video description
    mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
    if mobj is None:
        self._downloader.report_error(u'unable to extract video description')
        return
    video_description = mobj.group(1).decode('utf-8')
    if not video_description:
        video_description = 'No description available.'

    # Extract video height and width
    mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
    if mobj is None:
        self._downloader.report_error(u'unable to extract video height')
        return
    yv_video_height = mobj.group(1)

    mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
    if mobj is None:
        self._downloader.report_error(u'unable to extract video width')
        return
    yv_video_width = mobj.group(1)

    # Retrieve video playlist to extract media URL
    # I'm not completely sure what all these options are, but we
    # seem to need most of them, otherwise the server sends a 401.
    yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
    yv_bitrate = '700'  # according to Wikipedia this is hard-coded
    request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                                            '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                                            '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
    try:
        self.report_download_webpage(video_id)
        webpage = compat_urllib_request.urlopen(request).read()
    except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
        return

    # Extract media URL from playlist XML
    mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
    if mobj is None:
        self._downloader.report_error(u'Unable to extract media URL')
        return
    video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
    video_url = unescapeHTML(video_url)

    return [{
        'id':          video_id.decode('utf-8'),
        'url':         video_url,
        'uploader':    video_uploader,
        'upload_date': None,
        'title':       video_title,
        'ext':         video_extension.decode('utf-8'),
        'thumbnail':   video_thumbnail.decode('utf-8'),
        'description': video_description,
    }]
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
    IE_NAME = u'vimeo'

    def __init__(self, downloader=None):
        """Create the extractor, optionally wiring in a downloader."""
        InfoExtractor.__init__(self, downloader)

    def _real_extract(self, url, new_video=True):
        """Extract a single Vimeo video; returns a one-element info list."""
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        # Normalize the URL: force a scheme, and turn HLS redirect links
        # into plain video-page links.
        if not mobj.group('proto'):
            url = 'https://' + url
        if mobj.group('direct_link'):
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON embedded in the page.
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except:  # deliberately broad: any parse failure means no info section
            self._downloader.report_error(u'unable to extract info section')
            return

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        video_description = clean_html(video_description) if video_description else u''

        # Extract upload date
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = {'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_codec, video_extension, video_quality = files[quality][0]
                self.to_screen(u'%s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            self._downloader.report_error(u'no known codec found')
            return

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    %(video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id':          video_id,
            'url':         video_url,
            'uploader':    video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date': video_upload_date,
            'title':       video_title,
            'ext':         video_extension,
            'thumbnail':   video_thumbnail,
            'description': video_description,
        }]
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def __init__(self, downloader=None):
        """Create the extractor, optionally wiring in a downloader."""
        InfoExtractor.__init__(self, downloader)

    def fetch_webpage(self, url):
        """Download *url* and return its raw body, reporting failures."""
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch *url*, apply *regex*, and collect the groups named by
        matchTuples [(group_index, key, error_message), ...] into a dict."""
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                self._downloader.report_error(err)
                return
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        """Locate the live-stream player data for *url*.

        NOTE(review): the computed video_url is never returned or stored,
        so live extraction effectively yields nothing — confirm intent.
        """
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
                '(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path',   u'could not extract video path: %s' % url),
                (2, 'player', u'could not extract video player: %s' % url),
                (3, 'url',    u'could not extract video url: %s' % url)
            ]
        )
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Follow the Arte+7 reference chain and return an info dict."""
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id',    u'could not extract video id: %s' % url),
                (2, 'title', u'could not extract video title: %s' % url),
                (3, 'date',  u'could not extract video date: %s' % url),
                (4, 'url',   u'could not extract video url: %s' % url)
            ]
        )

        return {
            'id':          info.get('id'),
            'url':         compat_urllib_parse.unquote(info.get('url')),
            'uploader':    u'arte.tv',
            'upload_date': info.get('date'),
            'title':       info.get('title').decode('utf-8'),
        }

    def _real_extract(self, url):
        """Dispatch between live-stream and Arte+7 extraction."""
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor."""

    _VALID_URL = r'.*'
    IE_NAME = u'generic'

    def __init__(self, downloader=None):
        """Create the extractor, optionally wiring in a downloader."""
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        if not self._downloader.params.get('test', False):
            self._downloader.report_warning(u'Falling back on generic information extractor.')
        super(GenericIE, self).report_download_webpage(video_id)

    def report_following_redirect(self, new_url):
        """Report information extraction."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case return the new url."""
        class HeadRequest(compat_urllib_request.Request):
            # Issue HEAD instead of GET so we only fetch headers.
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    newheaders = dict((k, v) for k, v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                fp.read()
                fp.close()

                newheaders = dict((k, v) for k, v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                                      headers=newheaders,
                                                                      origin_req_host=req.get_origin_req_host(),
                                                                      unverifiable=True))

        # Build a bare opener with exactly the handlers we need.
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        if url == new_url:
            return False

        self.report_following_redirect(new_url)
        return new_url

    def _real_extract(self, url):
        """Best-effort extraction: follow redirects, then scrape for a
        file/source URL with progressively broader regexes."""
        new_url = self._test_redirect(url)
        if new_url:
            return [self.url_result(new_url)]

        video_id = url.split('/')[-1]
        try:
            webpage = self._download_webpage(url, video_id)
        except ValueError as err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit
            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit: JWPlayer JS loader
            mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        video_title = mobj.group(1)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        video_uploader = mobj.group(1)

        return [{
            'id':          video_id,
            'url':         video_url,
            'uploader':    video_uploader,
            'upload_date': None,
            'title':       video_title,
            'ext':         video_extension,
        }]
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries."""
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def __init__(self, downloader=None):
        """Create the extractor, optionally wiring in a downloader."""
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        # NOTE(review): query was utf-8 encoded by the caller but is decoded
        # with the locale's preferred encoding here — confirm on non-utf-8 locales.
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the ytsearchN:/ytsearchall: prefix and return N results."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            # BUG FIX: previously the result list was computed but not
            # returned, so 'ytsearchall:' queries yielded None.
            return self._get_n_results(query, self._max_youtube_results)
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_youtube_results:
                    self._downloader.report_warning(u'ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                    n = self._max_youtube_results
                return self._get_n_results(query, n)
            except ValueError:  # parsing prefix as integer fails
                return self._get_n_results(query, 1)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        limit = n

        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum + 1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50 * pagenum) + 1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download API page: %s' % compat_str(err))
                return
            api_response = json.loads(data)['data']

            if not 'items' in api_response:
                self._downloader.report_error(u'[youtube] No video results')
                return

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            # The API reports the true total; never request more than exists.
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids]
        return videos
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries."""
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def __init__(self, downloader=None):
        """Create the extractor, optionally wiring in a downloader."""
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self.to_screen(u'query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the gvsearchN:/gvsearchall: prefix and queue N downloads."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_google_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_google_results:
                    self._downloader.report_warning(u'gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
                    n = self._max_google_results
                self._download_n_results(query, n)
                return
            except ValueError:  # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum * 10)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in video_ids:
                    video_ids.append(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
                        return

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                # No further pages: queue whatever we found.
                for id in video_ids:
                    self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
                return

            pagenum = pagenum + 1
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""

    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def __init__(self, downloader=None):
        """Create the extractor, optionally wiring in a downloader."""
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self.to_screen(u'query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the yvsearchN:/yvsearchall: prefix and queue N downloads."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_yahoo_results:
                    self._downloader.report_warning(u'yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
                    n = self._max_yahoo_results
                self._download_n_results(query, n)
                return
            except ValueError:  # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        already_seen = set()
        pagenum = 1

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in already_seen:
                    video_ids.append(video_id)
                    already_seen.add(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                        return

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                # No further pages: queue whatever we found.
                for id in video_ids:
                    self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                return

            pagenum = pagenum + 1
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    _VALID_URL = r"""(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
                        .*
                     |
                        ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
    _MAX_RESULTS = 50
    IE_NAME = u'youtube:playlist'

    def __init__(self, downloader=None):
        """Create the extractor, optionally wiring in a downloader."""
        InfoExtractor.__init__(self, downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written in verbose mode, so re.VERBOSE is required.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        """Page through the GData playlist feed and return a playlist result."""
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        # Download playlist videos from API
        playlist_id = mobj.group(1) or mobj.group(2)
        page_num = 1
        videos = []
        # BUG FIX: initialize the title so an empty playlist (no 'entry' on
        # the first page) no longer raises UnboundLocalError below.
        playlist_title = None

        while True:
            self.report_download_page(playlist_id, page_num)

            url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
            try:
                page = compat_urllib_request.urlopen(url).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            try:
                response = json.loads(page)
            except ValueError as err:
                self._downloader.report_error(u'Invalid JSON in API response: ' + compat_str(err))
                return

            if 'feed' not in response:
                self._downloader.report_error(u'Got a malformed response from YouTube API')
                return
            if 'entry' not in response['feed']:
                # Number of videos is a multiple of self._MAX_RESULTS
                break

            playlist_title = response['feed']['title']['$t']

            # Keep (position, url) pairs so we can restore playlist order.
            videos += [ (entry['yt$position']['$t'], entry['content']['src'])
                        for entry in response['feed']['entry']
                        if 'content' in entry ]

            if len(response['feed']['entry']) < self._MAX_RESULTS:
                break
            page_num += 1

        videos = [v[1] for v in sorted(videos)]

        url_results = [self.url_result(url, 'Youtube') for url in videos]
        return [self.playlist_result(url_results, playlist_id, playlist_title)]
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def report_download_page(self, channel_id, pagenum):
        """Report attempt to download channel page with given number."""
        self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))

    def extract_videos_from_page(self, page):
        """Return the ordered, de-duplicated video ids found in *page*."""
        ids_in_page = []
        for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))
        return ids_in_page

    def _real_extract(self, url):
        """Collect all video ids of a channel and return a playlist result."""
        # Extract channel id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        # Download channel page
        channel_id = mobj.group(1)
        video_ids = []
        pagenum = 1

        self.report_download_page(channel_id, pagenum)
        url = self._TEMPLATE_URL % (channel_id, pagenum)
        request = compat_urllib_request.Request(url)
        try:
            page = compat_urllib_request.urlopen(request).read().decode('utf8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
            return

        # Extract video identifiers
        ids_in_page = self.extract_videos_from_page(page)
        video_ids.extend(ids_in_page)

        # Download any subsequent channel pages using the json-based channel_ajax query
        if self._MORE_PAGES_INDICATOR in page:
            while True:
                pagenum = pagenum + 1

                self.report_download_page(channel_id, pagenum)
                url = self._MORE_PAGES_URL % (pagenum, channel_id)
                request = compat_urllib_request.Request(url)
                try:
                    page = compat_urllib_request.urlopen(request).read().decode('utf8')
                except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                    self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                    return

                page = json.loads(page)

                ids_in_page = self.extract_videos_from_page(page['content_html'])
                video_ids.extend(ids_in_page)

                if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
                    break

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids]
        url_entries = [self.url_result(url, 'Youtube') for url in urls]
        return [self.playlist_result(url_entries, channel_id)]
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def __init__(self, downloader=None):
        """Create the extractor, optionally wiring in a downloader."""
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_extract(self, url):
        """Page through a user's uploads feed and return a playlist result."""
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 0

        while True:
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            self.report_download_page(username, start_index)

            request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.
            if len(ids_in_page) < self._GDATA_PAGE_SIZE:
                break

            pagenum += 1

        urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
        url_results = [self.url_result(url, 'Youtube') for url in urls]
        return [self.playlist_result(url_results, playlist_title = username)]
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def __init__(self, downloader=None):
        """Create the extractor, optionally wiring in a downloader."""
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self.to_screen(u'user %s: Downloading video ids from page %d' %
                (username, pagenum))

    def _real_extract(self, url):
        """Page through a blip.tv user's episode list via the mobile AJAX API."""
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        request = compat_urllib_request.Request(url)

        try:
            page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # The numeric users_id is only available on the user's page.
            mobj = re.search(r'data-users-id="([^"]+)"', page)
            page_base = page_base % mobj.group(1)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
            return

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(username, pagenum)
            url = page_base + "&page=" + str(pagenum)
            request = compat_urllib_request.Request( url )
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % str(err))
                return

            # Extract video identifiers
            ids_in_page = []
            for mobj in re.finditer(r'href="/([^"]+)"', page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(unescapeHTML(mobj.group(1)))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.
            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
        url_entries = [self.url_result(url, 'BlipTV') for url in urls]
        return [self.playlist_result(url_entries, playlist_title = username)]
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""
    # NOTE(review): excerpt — some source lines (a "try:" header, a
    # "mobj is None" guard, the final "return [{...}]" wrapper) are absent
    # from this view; code kept verbatim.

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def _real_extract(self, url):
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        self.report_download_webpage(file_id)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve file webpage: %s' % compat_str(err))

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                # Collapse whitespace in the site-provided restriction text.
                restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
                self._downloader.report_error(u'%s' % restriction_message)
            self._downloader.report_error(u'unable to extract download URL from: %s' % url)

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        self._downloader.report_error(u'unable to extract title')
        file_title = mobj.group(1).decode('utf-8')

        # Result dict fields (enclosing return statement not in this view).
        'id': file_id.decode('utf-8'),
        'url': file_url.decode('utf-8'),
        'upload_date': None,
        'title': file_title,
        'ext': file_extension.decode('utf-8'),
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""
    # NOTE(review): excerpt — guards, "try:" headers and assignment lines
    # are absent from this view; code kept verbatim.

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    def _real_initialize(self):
        # Optional login step: credentials come from --username/--password
        # or from a .netrc entry for machine "facebook".
        if self._downloader is None:
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            info = netrc.netrc().authenticators(self._NETRC_MACHINE)
            if info is not None:
            raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))

        # No credentials available -> skip login (guard body not in view).
        if useremail is None:
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        login_results = compat_urllib_request.urlopen(request).read()
        # Presence of the login <form> in the response means login failed.
        if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
            self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # Presumably only reached when mobj is None — guard line absent here.
        self._downloader.report_error(u'invalid URL: %s' % url)
        video_id = mobj.group('ID')

        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The player parameters sit between these two JS fragments.
        BEFORE = '{swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        video_data = params['video_data'][0]
        # Prefer the HD source, fall back to SD (fallback guard not in view).
        video_url = video_data.get('hd_src')
        video_url = video_data['sd_src']
        raise ExtractorError(u'Cannot find video URL')
        video_duration = int(video_data['video_duration'])
        thumbnail = video_data['thumbnail_src']

        m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
        raise ExtractorError(u'Cannot find title in webpage')
        video_title = unescapeHTML(m.group(1))

        # Result dict fields (enclosing return statement not in this view).
        'title': video_title,
        'duration': video_duration,
        'thumbnail': thumbnail,
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""
    # NOTE(review): excerpt — "try:" headers, guards and parts of the
    # direct-download/info dicts are absent from this view; code verbatim.

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    # Used to pull the filename extension off the media URL.
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_direct_download(self, title):
        """Report information extraction."""
        self.to_screen(u'%s: Direct download detected' % title)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # Presumably only reached when mobj is None — guard line absent here.
        self._downloader.report_error(u'invalid URL: %s' % url)

        # /play/ URLs redirect; resolve them to the canonical short form
        # and re-enter extraction with the rewritten URL.
        urlp = compat_urllib_parse_urlparse(url)
        if urlp.path.startswith('/play/'):
            request = compat_urllib_request.Request(url)
            response = compat_urllib_request.urlopen(request)
            redirecturl = response.geturl()
            rurlp = compat_urllib_parse_urlparse(redirecturl)
            # The file id lives in the URL fragment's "file" query param.
            file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
            url = 'http://blip.tv/a/a-' + file_id
            return self._real_extract(url)

        # Ask blip.tv for JSON metadata (cchar computed outside this view).
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        # The iTunes UA is required for the JSON endpoint to respond.
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        urlh = compat_urllib_request.urlopen(request)
        if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
            basename = url.split('/')[-1]
            title,ext = os.path.splitext(basename)
            title = title.decode('UTF-8')
            ext = ext.replace('.', '')
            self.report_direct_download(title)
            'upload_date': None,
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
            json_code_bytes = urlh.read()
            json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to read video info webpage: %s' % compat_str(err))

            json_data = json.loads(json_code)
            # Metadata may be wrapped in a top-level "Post" object.
            if 'Post' in json_data:
                data = json_data['Post']
            # blip.tv datestamps look like "m-d-y H:M(am|pm)".
            upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
            video_url = data['media']['url']
            umobj = re.match(self._URL_EXT, video_url)
            raise ValueError('Can not determine filename extension')
            ext = umobj.group(1)

            # Result dict fields (enclosing assignment not in this view).
            'id': data['item_id'],
            'uploader': data['display_name'],
            'upload_date': upload_date,
            'title': data['title'],
            'format': data['media']['mimeType'],
            'thumbnail': data['thumbnailUrl'],
            'description': data['description'],
            'player_url': data['embedUrl'],
            'user_agent': 'iTunes/10.6.1',
            except (ValueError,KeyError) as err:
                self._downloader.report_error(u'unable to parse video information: %s' % repr(err))
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""
    # NOTE(review): excerpt — guard lines and the return wrapper are absent
    # from this view; code kept verbatim.

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def __init__(self, downloader=None):
        # Plain delegation to the base constructor.
        InfoExtractor.__init__(self, downloader)

    def _real_extract(self,url):
        mobj = re.match(self._VALID_URL, url)
        # BUG(review): "self._download" has no trailing "er" — should be
        # self._downloader.report_error; left untouched (guard line absent).
        self._download.report_error(u'invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)
        # The thumbnail link encodes the media base URL; appending
        # "/<id>.flv" yields the video URL.
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\'',
        self._downloader.report_error(u'unable to extract media URL')

        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        self._downloader.report_error(u'unable to extract title')

        video_title = mobj.group(1)

        # Result dict fields (enclosing return statement not in this view).
        'upload_date': None,
        'title': video_title,
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """
    # NOTE(review): excerpt — the regex closing quotes, several dict
    # literals, "try:" headers, guards and loop plumbing are absent from
    # this view; code kept verbatim.

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                      |(https?://)?(www\.)?
                      (?P<showname>thedailyshow|colbertnation)\.com/
                      (full-episodes/(?P<episode>.*)|
                       (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                       |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
    # Bitrates offered by the CDN, lowest to highest.
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    _video_extensions = {
    _video_dimensions = {

    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # re.VERBOSE is required because _VALID_URL is a verbose regex.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_config_download(self, episode_id, media_id):
        self.to_screen(u'%s: Downloading configuration for %s' % (episode_id, media_id))

    def report_index_download(self, episode_id):
        self.to_screen(u'%s: Downloading show index' % episode_id)

    def _print_formats(self, formats):
        print('Available formats:')
        print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        # Presumably only reached when mobj is None — guard line absent here.
        self._downloader.report_error(u'invalid URL: %s' % url)

        # Shortname forms (":tds", ":colbert", ...) map to the show's
        # full-episodes page, then the URL is re-matched.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
                epTitle = mobj.group('cntitle')
            dlNewest = not mobj.group('episode')
            epTitle = mobj.group('showname')
            epTitle = mobj.group('episode')

        req = compat_urllib_request.Request(url)
        self.report_extraction(epTitle)
        htmlHandle = compat_urllib_request.urlopen(req)
        html = htmlHandle.read()
        webpage = html.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))

        # Follow the redirect to learn the concrete episode URL.
        url = htmlHandle.geturl()
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        self._downloader.report_error(u'Invalid redirected URL: ' + url)
        if mobj.group('episode') == '':
            self._downloader.report_error(u'Redirected URL is still not specific: ' + url)
        epTitle = mobj.group('episode')

        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                self._downloader.report_error(u'unable to find Flash URL in webpage ' + url)
            mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        indexXml = compat_urllib_request.urlopen(indexUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download episode index: ' + compat_str(err))

        # Each <item> in the MRSS index is one part of the episode.
        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for partNum,itemEl in enumerate(itemEls):
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configReq = compat_urllib_request.Request(configUrl)
            self.report_config_download(epTitle, shortMediaId)
            configXml = compat_urllib_request.urlopen(configReq).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))

            cdoc = xml.etree.ElementTree.fromstring(configXml)
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)

            self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])

            # For now, just pick the highest bitrate
            format,rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            format, rtmp_video_url = f, v

            # Rewrite the RTMP URL onto the plain-HTTP mirror.
            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
            'upload_date': officialDate,
            'description': officialTitle,

            results.append(info)
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """
    # NOTE(review): excerpt — "try:" headers, a URL guard and the return
    # wrapper are absent from this view; code kept verbatim.

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_config_download(self, showName):
        self.to_screen(u'%s: Downloading configuration' % showName)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # Presumably only reached when mobj is None — guard line absent here.
        self._downloader.report_error(u'invalid URL: %s' % url)
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        webPage = compat_urllib_request.urlopen(url)
        webPageBytes = webPage.read()
        # Decode using the charset from Content-Type, defaulting to UTF-8.
        m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
        webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: ' + compat_str(err))

        # Metadata is carried in <meta> tags; the player URL embeds a
        # percent-encoded config URL in its "config=" query parameter.
        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
        description = unescapeHTML(descMatch.group(1))
        imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
        imgUrl = unescapeHTML(imgMatch.group(1))
        playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
        playerUrl = unescapeHTML(playerUrlMatch.group(1))
        configUrlMatch = re.search('config=(.*)$', playerUrl)
        configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))

        self.report_config_download(showName)
        configJSON = compat_urllib_request.urlopen(configUrl)
        m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
        configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download configuration: ' + compat_str(err))

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

        config = json.loads(configJSON)
        except (ValueError,) as err:
            self._downloader.report_error(u'Invalid JSON in configuration file: ' + compat_str(err))

        playlist = config['playlist']
        # Entry 1 of the playlist carries the actual media URL.
        videoUrl = playlist[1]['url']

        # Result dict fields (enclosing return statement not in this view).
        'uploader': showName,
        'upload_date': None,
        'thumbnail': imgUrl,
        'description': description,
        'player_url': playerUrl,
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""
    # NOTE(review): excerpt — "try:" headers, guards and the closing return
    # are absent from this view; code kept verbatim.

    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Downloading XML manifest' % video_id)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # Presumably only reached when mobj is None — guard line absent here.
        self._downloader.report_error(u'invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        # Partial info dict; remaining fields are filled in below.
        'upload_date': None,

        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        videoNode = mdoc.findall('./video')[0]
        info['description'] = videoNode.findall('./description')[0].text
        info['title'] = videoNode.findall('./caption')[0].text
        info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
        manifest_url = videoNode.findall('./file')[0].text
        self._downloader.report_error(u'Invalid metadata XML file')

        # hdcore parameter is required for the Adobe HDS manifest.
        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))

        # Parse the F4M manifest (Adobe f4m/1.0 namespace).
        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
        node_id = media_node.attrib['url']
        video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError as err:
            self._downloader.report_error(u'Invalid manifest file')

        # Build the first-fragment URL from the manifest's location.
        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""
    # NOTE(review): excerpt — "mobj is None" guards and the return wrapper
    # are absent from this view; code kept verbatim.

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # Presumably only reached when mobj is None — guard line absent here.
        self._downloader.report_error(u'invalid URL: %s' % url)
        video_id = mobj.group(1)

        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        # Extract video URL (percent-encoded in the flv_url page variable).
        mobj = re.search(r'flv_url=(.+?)&', webpage)
        self._downloader.report_error(u'unable to extract video url')
        video_url = compat_urllib_parse.unquote(mobj.group(1))

        # Extract title from the <title> tag, dropping the site suffix.
        mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        self._downloader.report_error(u'unable to extract video title')
        video_title = mobj.group(1)

        # Extract video thumbnail
        mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        self._downloader.report_error(u'unable to extract video thumbnail')
        video_thumbnail = mobj.group(0)

        # Result dict fields (enclosing return statement not in this view).
        'upload_date': None,
        'title': video_title,
        'thumbnail': video_thumbnail,
        'description': None,
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
    """
    # NOTE(review): excerpt — guard lines, "try:" headers and the return
    # wrapper are absent from this view; code kept verbatim.

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        # Plain delegation to the base constructor.
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # Presumably only reached when mobj is None — guard line absent here.
        self._downloader.report_error(u'invalid URL: %s' % url)

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract simple title (uploader + slug of song title)
        slug_title = mobj.group(2)
        simple_title = uploader + u'-' + slug_title

        self.report_resolve('%s/%s' % (uploader, slug_title))

        # Resolve the page URL to track metadata via the public API.
        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(resolv_url)
        info_json_bytes = compat_urllib_request.urlopen(request).read()
        info_json = info_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction('%s/%s' % (uploader, slug_title))

        # Fetch the per-track stream catalogue; the 128kbps MP3 HTTP
        # stream is used below.
        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(streams_url)
        stream_json_bytes = compat_urllib_request.urlopen(request).read()
        stream_json = stream_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download stream definitions: %s' % compat_str(err))

        streams = json.loads(stream_json)
        mediaURL = streams['http_mp3_128_url']

        # Result dict fields (enclosing return statement not in this view).
        'uploader': info['user']['username'],
        'upload_date': info['created_at'],
        'title': info['title'],
        'description': info['description'],
class SoundcloudSetIE(InfoExtractor):
    """Information extractor for soundcloud.com sets
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
    """
    # NOTE(review): excerpt — guards, "try:" headers and result-dict
    # wrappers are absent from this view; code kept verbatim.

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
    # NOTE(review): IE_NAME duplicates SoundcloudIE's — likely should be
    # u'soundcloud:set'; verify before changing.
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        # Plain delegation to the base constructor.
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # Presumably only reached when mobj is None — guard line absent here.
        self._downloader.report_error(u'invalid URL: %s' % url)

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract simple title (uploader + slug of song title)
        slug_title = mobj.group(2)
        simple_title = uploader + u'-' + slug_title

        self.report_resolve('%s/sets/%s' % (uploader, slug_title))

        # Resolve the set URL to its track list via the public API.
        url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(resolv_url)
        info_json_bytes = compat_urllib_request.urlopen(request).read()
        info_json = info_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))

        info = json.loads(info_json)
        # API errors come back as an "errors" list of message objects.
        if 'errors' in info:
            for err in info['errors']:
                self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))

        # One stream lookup per track in the set.
        for track in info['tracks']:
            video_id = track['id']
            self.report_extraction('%s/sets/%s' % (uploader, slug_title))

            streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
            request = compat_urllib_request.Request(streams_url)
            stream_json_bytes = compat_urllib_request.urlopen(request).read()
            stream_json = stream_json_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download stream definitions: %s' % compat_str(err))

            streams = json.loads(stream_json)
            mediaURL = streams['http_mp3_128_url']

            # Per-track result dict fields (enclosing append not in view).
            'uploader': track['user']['username'],
            'upload_date': track['created_at'],
            'title': track['title'],
            'description': track['description'],
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    # NOTE(review): excerpt — guard lines and the return wrapper are absent
    # from this view; code kept verbatim.
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # Presumably only reached when mobj is None — guard line absent here.
        self._downloader.report_error(u'invalid URL: %s' % url)

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # Extract video URL: the real id is base64-encoded in jsclassref.
        mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
        self._downloader.report_error(u'unable to extract video url')
        real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        # Extract title
        mobj = re.search(r'contentTitle = "(.*?)";', webpage)
        self._downloader.report_error(u'unable to extract video title')
        video_title = mobj.group(1)

        # Extract description
        video_description = u'No description available.'
        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if mobj is not None:
            video_description = mobj.group(1)

        # Derive id/extension from the media filename.
        video_filename = video_url.split('/')[-1]
        video_id, extension = video_filename.split('.')

        # Result dict fields (enclosing return statement not in this view).
        'upload_date': None,
        'title': video_title,
        'ext': extension, # Extension is always(?) mp4, but seems to be flv
        'description': video_description,
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""
    # NOTE(review): excerpt — "try:" headers, guards, returns and loop
    # plumbing are absent from this view; code kept verbatim.

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def __init__(self, downloader=None):
        # Plain delegation to the base constructor.
        InfoExtractor.__init__(self, downloader)

    def report_download_json(self, file_id):
        """Report JSON download."""
        self.to_screen(u'Downloading json')

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json"""
        bitrate_list = jsonData[fmt]
        if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
            bitrate = max(bitrate_list) # select highest

        url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        for url in url_list:
            # Probe each candidate; return body absent from this view.
            compat_urllib_request.urlopen(url)
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:

    def _print_formats(self, formats):
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                ext = formats[fmt][b][0]
                print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # Presumably only reached when mobj is None — guard line absent here.
        self._downloader.report_error(u'invalid URL: %s' % url)
        # extract uploader & filename from url
        uploader = mobj.group(1).decode('utf-8')
        file_id = uploader + "-" + mobj.group(2).decode('utf-8')

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        self.report_download_json(file_url)
        jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve file: %s' % compat_str(err))

        # parse JSON
        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)

        # Default behaviour: first working URL of any format.
        if req_format is None or req_format == 'best':
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
            if req_format not in formats:
                self._downloader.report_error(u'format is not available')

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        # Result dict fields (enclosing return statement not in this view).
        'id': file_id.decode('utf-8'),
        'url': file_url.decode('utf-8'),
        'uploader': uploader.decode('utf-8'),
        'upload_date': None,
        'title': json_data['name'],
        'ext': file_url.split('.')[-1].decode('utf-8'),
        'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
        'thumbnail': json_data['thumbnail_url'],
        'description': json_data['description'],
        'player_url': player_url.decode('utf-8'),
# NOTE(review): elided listing -- the numeric prefix on each line is the
# original file's line number; gaps in that numbering are missing lines.
# Code left byte-identical; comments only. Verify against the full source.
#
# StanfordOpenClassroomIE: extracts videos from openclassroom.stanford.edu.
# Handles three URL shapes: a specific video, a course page, and the site root.
2991 class StanfordOpenClassroomIE(InfoExtractor):
2992 """Information extractor for Stanford's Open ClassRoom"""
# Named groups 'course' and 'video' drive the three-way dispatch below.
2994 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
2995 IE_NAME = u'stanfordoc'
2997 def _real_extract(self, url):
2998 mobj = re.match(self._VALID_URL, url)
# Raised when the URL does not match _VALID_URL (guard condition elided).
3000 raise ExtractorError(u'Invalid URL: %s' % url)
# --- Branch 1: a specific video page ---
3002 if mobj.group('course') and mobj.group('video'): # A specific video
3003 course = mobj.group('course')
3004 video = mobj.group('video')
3006 'id': course + '_' + video,
3008 'upload_date': None,
3011 self.report_extraction(info['id'])
# Per-video metadata lives in an XML file next to the video directory.
3012 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
3013 xmlUrl = baseUrl + video + '.xml'
3015 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
3016 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3017 self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
3019 mdoc = xml.etree.ElementTree.fromstring(metaXml)
# Title and file name come from the first matching XML child elements;
# IndexError here is presumably caught by the (elided) surrounding try.
3021 info['title'] = mdoc.findall('./title')[0].text
3022 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
3024 self._downloader.report_error(u'Invalid metadata XML file')
3026 info['ext'] = info['url'].rpartition('.')[2]
# --- Branch 2: a course page; collects links to individual video pages ---
3028 elif mobj.group('course'): # A course page
3029 course = mobj.group('course')
3034 'upload_date': None,
3037 coursepage = self._download_webpage(url, info['id'],
3038 note='Downloading course info page',
3039 errnote='Unable to download course info page')
3041 m = re.search('<h1>([^<]+)</h1>', coursepage)
3043 info['title'] = unescapeHTML(m.group(1))
# Fall back to the course id when no <h1> title is found.
3045 info['title'] = info['id']
3047 m = re.search('<description>([^<]+)</description>', coursepage)
3049 info['description'] = unescapeHTML(m.group(1))
# Collect VideoPage links (deduplicated, order preserved) as references.
3051 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
3054 'type': 'reference',
3055 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
# Recurse into each referenced video page via self.extract().
3059 for entry in info['list']:
3060 assert entry['type'] == 'reference'
3061 results += self.extract(entry['url'])
# --- Branch 3: the site root; collects links to all course pages ---
3065 'id': 'Stanford OpenClassroom',
3068 'upload_date': None,
3071 self.report_download_webpage(info['id'])
3072 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
3074 rootpage = compat_urllib_request.urlopen(rootURL).read()
3075 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3076 self._downloader.report_error(u'unable to download course info page: ' + compat_str(err))
3079 info['title'] = info['id']
3081 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
3084 'type': 'reference',
3085 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
3090 for entry in info['list']:
3091 assert entry['type'] == 'reference'
3092 results += self.extract(entry['url'])
# NOTE(review): elided listing -- numeric prefixes are original line numbers;
# gaps are missing lines. Code byte-identical; comments only.
#
# MTVIE: scrapes song/performer metadata from an mtv.com video page, then
# fetches a mediaGen XML playlist and picks a rendition from it.
3095 class MTVIE(InfoExtractor):
3096 """Information extractor for MTV.com"""
3098 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
3101 def _real_extract(self, url):
3102 mobj = re.match(self._VALID_URL, url)
3104 self._downloader.report_error(u'invalid URL: %s' % url)
# Normalize scheme-less URLs before downloading.
3106 if not mobj.group('proto'):
3107 url = 'http://' + url
3108 video_id = mobj.group('videoid')
3110 webpage = self._download_webpage(url, video_id)
# Song name and performer come from <meta> tags; note the iso-8859-1
# decode, which implies the page bytes here are assumed Latin-1.
3112 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
3114 self._downloader.report_error(u'unable to extract song name')
3116 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3117 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
3119 self._downloader.report_error(u'unable to extract performer')
3121 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3122 video_title = performer + ' - ' + song_name
3124 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
# FIXME(review): message is missing a verb -- should read
# u'unable to extract mtvn_uri' (runtime string; not changed here).
3126 self._downloader.report_error(u'unable to mtvn_uri')
3128 mtvn_uri = mobj.group(1)
3130 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
3132 self._downloader.report_error(u'unable to extract content id')
3134 content_id = mobj.group(1)
# mediaGen endpoint returns an XML document listing renditions.
3136 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
3137 self.report_extraction(video_id)
3138 request = compat_urllib_request.Request(videogen_url)
3140 metadataXml = compat_urllib_request.urlopen(request).read()
3141 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3142 self._downloader.report_error(u'unable to download video metadata: %s' % compat_str(err))
3145 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
3146 renditions = mdoc.findall('.//rendition')
3148 # For now, always pick the highest quality.
# Assumes renditions are ordered ascending by quality -- TODO confirm.
3149 rendition = renditions[-1]
# MIME type like "video/mp4" -> ext "mp4".
3152 _,_,ext = rendition.attrib['type'].partition('/')
3153 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
3154 video_url = rendition.find('./src').text
3156 self._downloader.report_error('Invalid rendition field.')
3162 'uploader': performer,
3163 'upload_date': None,
3164 'title': video_title,
# NOTE(review): elided listing -- numeric prefixes are original line numbers;
# gaps are missing lines. Code byte-identical; comments only.
#
# YoukuIE: fetches Youku's getPlayList JSON, de-obfuscates the segment file
# id with a seeded PRNG shuffle, and yields one info dict per flv segment.
3172 class YoukuIE(InfoExtractor):
3173 _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
# Body of _gen_sid (its 'def' line, original 3175, is elided): builds a
# session id from the millisecond timestamp plus two random components.
3176 nowTime = int(time.time() * 1000)
3177 random1 = random.randint(1000,1998)
3178 random2 = random.randint(1000,9999)
3180 return "%d%d%d" %(nowTime,random1,random2)
# Deterministic shuffle of the alphabet driven by the server-supplied seed
# (linear congruential step: seed = (seed*211 + 30031) % 65536).
3182 def _get_file_ID_mix_string(self, seed):
3184 source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
3186 for i in range(len(source)):
3187 seed = (seed * 211 + 30031 ) % 65536
3188 index = math.floor(seed / 65536 * len(source) )
3189 mixed.append(source[int(index)])
# Each picked character is removed so every character is used exactly once.
3190 source.remove(source[int(index)])
3191 #return ''.join(mixed)
# Maps the '*'-separated numeric fileId onto the shuffled alphabet.
3194 def _get_file_id(self, fileId, seed):
3195 mixed = self._get_file_ID_mix_string(seed)
3196 ids = fileId.split('*')
3200 realId.append(mixed[int(ch)])
3201 return ''.join(realId)
3203 def _real_extract(self, url):
3204 mobj = re.match(self._VALID_URL, url)
3206 self._downloader.report_error(u'invalid URL: %s' % url)
3208 video_id = mobj.group('ID')
3210 info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
3212 request = compat_urllib_request.Request(info_url, None, std_headers)
3214 self.report_download_webpage(video_id)
3215 jsondata = compat_urllib_request.urlopen(request).read()
3216 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3217 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
3220 self.report_extraction(video_id)
3222 jsonstr = jsondata.decode('utf-8')
3223 config = json.loads(jsonstr)
3225 video_title = config['data'][0]['title']
3226 seed = config['data'][0]['seed']
# Format selection: 'best' prefers hd2 when available; 'worst' branch and
# the intermediate cases are partially elided here.
3228 format = self._downloader.params.get('format', None)
3229 supported_format = list(config['data'][0]['streamfileids'].keys())
3231 if format is None or format == 'best':
3232 if 'hd2' in supported_format:
3237 elif format == 'worst':
3245 fileid = config['data'][0]['streamfileids'][format]
# One key per segment; each key authorizes one getFlvPath request.
3246 keys = [s['k'] for s in config['data'][0]['segs'][format]]
3247 except (UnicodeDecodeError, ValueError, KeyError):
3248 self._downloader.report_error(u'unable to extract info section')
3252 sid = self._gen_sid()
3253 fileid = self._get_file_id(fileid, seed)
3255 #column 8,9 of fileid represent the segment number
3256 #fileid[7:9] should be changed
3257 for index, key in enumerate(keys):
# Splice the zero-based segment index (2 hex digits) into the file id.
3259 temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
3260 download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
3263 'id': '%s_part%02d' % (video_id, index),
3264 'url': download_url,
3266 'upload_date': None,
3267 'title': video_title,
3270 files_info.append(info)
# NOTE(review): elided listing -- numeric prefixes are original line numbers;
# gaps are missing lines. Code byte-identical; comments only.
#
# XNXXIE: extracts the flv URL, title, and thumbnail from a video.xnxx.com
# page with three class-level regexes.
3275 class XNXXIE(InfoExtractor):
3276 """Information extractor for xnxx.com"""
3278 _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
# Page-scraping patterns: flv URL and thumbnail are URL-encoded query
# parameters inside the page; title comes from the <title> tag.
3280 VIDEO_URL_RE = r'flv_url=(.*?)&'
3281 VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
3282 VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'
3284 def _real_extract(self, url):
3285 mobj = re.match(self._VALID_URL, url)
3287 self._downloader.report_error(u'invalid URL: %s' % url)
# Numeric id is the first capture group of _VALID_URL.
3289 video_id = mobj.group(1)
3291 self.report_download_webpage(video_id)
3293 # Get webpage content
3295 webpage_bytes = compat_urllib_request.urlopen(url).read()
3296 webpage = webpage_bytes.decode('utf-8')
3297 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3298 self._downloader.report_error(u'unable to download video webpage: %s' % err)
3301 result = re.search(self.VIDEO_URL_RE, webpage)
3303 self._downloader.report_error(u'unable to extract video url')
# flv_url is percent-encoded in the page; unquote before use.
3305 video_url = compat_urllib_parse.unquote(result.group(1))
3307 result = re.search(self.VIDEO_TITLE_RE, webpage)
3309 self._downloader.report_error(u'unable to extract video title')
3311 video_title = result.group(1)
3313 result = re.search(self.VIDEO_THUMB_RE, webpage)
3315 self._downloader.report_error(u'unable to extract video thumbnail')
3317 video_thumbnail = result.group(1)
3323 'upload_date': None,
3324 'title': video_title,
3326 'thumbnail': video_thumbnail,
3327 'description': None,
# NOTE(review): elided listing -- numeric prefixes are original line numbers;
# gaps are missing lines. Code byte-identical except docstring typo fixes
# ("extry" -> corrected); comments only otherwise.
#
# GooglePlusIE: extracts a video from a Google+ post by scraping the post
# page, following the photo/video page, and picking the best resolution link.
3331 class GooglePlusIE(InfoExtractor):
3332 """Information extractor for plus.google.com."""
3334 _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
3335 IE_NAME = u'plus.google'
3337 def __init__(self, downloader=None):
3338 InfoExtractor.__init__(self, downloader)
3340 def report_extract_entry(self, url):
3341 """Report that the post entry is being downloaded."""
3342 self.to_screen(u'Downloading entry: %s' % url)
3344 def report_date(self, upload_date):
3345 """Report the entry's upload date."""
3346 self.to_screen(u'Entry date: %s' % upload_date)
3348 def report_uploader(self, uploader):
3349 """Report the entry's uploader."""
3350 self.to_screen(u'Uploader: %s' % uploader)
3352 def report_title(self, video_title):
3353 """Report the entry's title."""
3354 self.to_screen(u'Title: %s' % video_title)
3356 def report_extract_vid_page(self, video_page):
3357 """Report that the video page is being extracted."""
3358 self.to_screen(u'Extracting video page: %s' % video_page)
3360 def _real_extract(self, url):
3361 # Extract id from URL
3362 mobj = re.match(self._VALID_URL, url)
3364 self._downloader.report_error(u'Invalid URL: %s' % url)
3367 post_url = mobj.group(0)
3368 video_id = mobj.group(1)
3370 video_extension = 'flv'
3372 # Step 1, Retrieve post webpage to extract further information
3373 self.report_extract_entry(post_url)
3374 request = compat_urllib_request.Request(post_url)
3376 webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3377 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3378 self._downloader.report_error(u'Unable to retrieve entry webpage: %s' % compat_str(err))
3381 # Extract update date
3383 pattern = 'title="Timestamp">(.*?)</a>'
3384 mobj = re.search(pattern, webpage)
3386 upload_date = mobj.group(1)
3387 # Convert timestring to a format suitable for filename
# Page date assumed to be ISO "YYYY-MM-DD"; reformatted to YYYYMMDD.
3388 upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3389 upload_date = upload_date.strftime('%Y%m%d')
3390 self.report_date(upload_date)
3394 pattern = r'rel\="author".*?>(.*?)</a>'
3395 mobj = re.search(pattern, webpage)
3397 uploader = mobj.group(1)
3398 self.report_uploader(uploader)
3401 # Get the first line for title
# The meta Description is truncated at the first newline, '<' or '"'.
3403 pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3404 mobj = re.search(pattern, webpage)
3406 video_title = mobj.group(1)
3407 self.report_title(video_title)
3409 # Step 2, Stimulate clicking the image box to launch video
3410 pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3411 mobj = re.search(pattern, webpage)
3413 self._downloader.report_error(u'unable to extract video page URL')
3415 video_page = mobj.group(1)
3416 request = compat_urllib_request.Request(video_page)
3418 webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3419 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3420 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
3422 self.report_extract_vid_page(video_page)
3425 # Extract video links on video page
3426 """Extract video links of all sizes"""
3427 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3428 mobj = re.findall(pattern, webpage)
3430 self._downloader.report_error(u'unable to extract video links')
3432 # Sort in resolution
# Tuples sort by the captured resolution field first.
3433 links = sorted(mobj)
3435 # Choose the lowest of the sort, i.e. highest resolution
3436 video_url = links[-1]
3437 # Only get the url. The resolution part in the tuple has no use anymore
3438 video_url = video_url[-1]
3439 # Treat escaped \u0026 style hex
# Python 2 path: str.decode exists; Python 3 falls through to the
# AttributeError handler below.
3441 video_url = video_url.decode("unicode_escape")
3442 except AttributeError: # Python 3
3443 video_url = bytes(video_url, 'ascii').decode('unicode-escape')
3449 'uploader': uploader,
3450 'upload_date': upload_date,
3451 'title': video_title,
3452 'ext': video_extension,
# NOTE(review): elided listing -- numeric prefixes are original line numbers;
# gaps are missing lines. Code byte-identical; comments only.
#
# NBAIE: derives a direct CDN mp4 URL from the page path and scrapes
# title/date/description from meta tags.
3455 class NBAIE(InfoExtractor):
3456 _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
3459 def _real_extract(self, url):
3460 mobj = re.match(self._VALID_URL, url)
3462 self._downloader.report_error(u'invalid URL: %s' % url)
3465 video_id = mobj.group(1)
# Strip a trailing '/index.html' so the id maps onto the CDN path.
3466 if video_id.endswith('/index.html'):
3467 video_id = video_id[:-len('/index.html')]
3469 webpage = self._download_webpage(url, video_id)
3471 video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
# Helper: first capture group of rexp in the page, HTML-unescaped,
# or `default` when the pattern does not match.
3472 def _findProp(rexp, default=None):
3473 m = re.search(rexp, webpage)
3475 return unescapeHTML(m.group(1))
3479 shortened_video_id = video_id.rpartition('/')[2]
3480 title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
3482 'id': shortened_video_id,
# FIXME(review): 'uploader_date' looks like a typo for 'upload_date'
# (dict key is runtime-visible; not changed in this comment-only pass).
3486 'uploader_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
3487 'description': _findProp(r'<div class="description">(.*?)</h1>'),
# NOTE(review): elided listing -- numeric prefixes are original line numbers;
# gaps are missing lines. Code byte-identical; comments only.
#
# JustinTVIE: queries the justin.tv JSON API, paging through a channel's
# archives or fetching a single broadcast.
3491 class JustinTVIE(InfoExtractor):
3492 """Information extractor for justin.tv and twitch.tv"""
3493 # TODO: One broadcast may be split into multiple videos. The key
3494 # 'broadcast_id' is the same for all parts, and 'broadcast_part'
3495 # starts at 1 and increases. Can we treat all parts as one video?
# Group 1 = channel name, optional group 2 = broadcast id ('/b/<id>').
3497 _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
3498 ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
3499 _JUSTIN_PAGE_LIMIT = 100
3500 IE_NAME = u'justin.tv'
3502 def report_download_page(self, channel, offset):
3503 """Report attempt to download a single page of videos."""
3504 self.to_screen(u'%s: Downloading video information from %d to %d' %
3505 (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
3507 # Return count of items, list of *valid* items
3508 def _parse_page(self, url):
3510 urlh = compat_urllib_request.urlopen(url)
3511 webpage_bytes = urlh.read()
# 'ignore' drops undecodable bytes instead of raising.
3512 webpage = webpage_bytes.decode('utf-8', 'ignore')
3513 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3514 self._downloader.report_error(u'unable to download video info JSON: %s' % compat_str(err))
3517 response = json.loads(webpage)
# A non-list response is an API error object with an 'error' field.
3518 if type(response) != list:
3519 error_text = response.get('error', 'unknown error')
3520 self._downloader.report_error(u'Justin.tv API: %s' % error_text)
3523 for clip in response:
3524 video_url = clip['video_file_url']
3526 video_extension = os.path.splitext(video_url)[1][1:]
# 'start_time' is 'YYYY-MM-DD...'; strip dashes to get YYYYMMDD.
3527 video_date = re.sub('-', '', clip['start_time'][:10])
3528 video_uploader_id = clip.get('user_id', clip.get('channel_id'))
3529 video_id = clip['id']
3530 video_title = clip.get('title', video_id)
3534 'title': video_title,
3535 'uploader': clip.get('channel_name', video_uploader_id),
3536 'uploader_id': video_uploader_id,
3537 'upload_date': video_date,
3538 'ext': video_extension,
# Returns (total item count in the API response, collected info dicts).
3540 return (len(response), info)
3542 def _real_extract(self, url):
3543 mobj = re.match(self._VALID_URL, url)
3545 self._downloader.report_error(u'invalid URL: %s' % url)
3548 api = 'http://api.justin.tv'
# lastindex == 1 means only the channel matched (paged archive listing);
# lastindex == 2 means a single broadcast id matched.
3549 video_id = mobj.group(mobj.lastindex)
3551 if mobj.lastindex == 1:
3553 api += '/channel/archives/%s.json'
3555 api += '/broadcast/by_archive/%s.json'
3556 api = api % (video_id,)
3558 self.report_extraction(video_id)
# Page through the API until a short page signals the end ('paged' is
# set on an elided line -- presumably True only for the channel case).
3562 limit = self._JUSTIN_PAGE_LIMIT
3565 self.report_download_page(video_id, offset)
3566 page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
3567 page_count, page_info = self._parse_page(page_url)
3568 info.extend(page_info)
3569 if not paged or page_count != limit:
# NOTE(review): elided listing -- numeric prefixes are original line numbers;
# gaps are missing lines. Code byte-identical; comments only.
#
# FunnyOrDieIE: pulls the video URL from the <video>/<source> markup and the
# title from the player heading (falling back to <title>).
3574 class FunnyOrDieIE(InfoExtractor):
3575 _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
3577 def _real_extract(self, url):
3578 mobj = re.match(self._VALID_URL, url)
3580 self._downloader.report_error(u'invalid URL: %s' % url)
3583 video_id = mobj.group('id')
3584 webpage = self._download_webpage(url, video_id)
# The second <source> tag carries the file URL.
3586 m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
3588 self._downloader.report_error(u'unable to find video information')
3589 video_url = unescapeHTML(m.group('url'))
3591 m = re.search(r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", webpage, flags=re.DOTALL)
# Fallback: page <title> when the player heading is absent.
3593 m = re.search(r'<title>(?P<title>[^<]+?)</title>', webpage)
3595 self._downloader.report_error(u'Cannot find video title')
3596 title = clean_html(m.group('title'))
3598 m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
3600 desc = unescapeHTML(m.group('desc'))
3609 'description': desc,
# NOTE(review): elided listing -- numeric prefixes are original line numbers;
# gaps are missing lines. Code byte-identical; comments only.
#
# SteamIE: scrapes all movie entries from a Steam store page (via the
# age-gate bypass URL) and returns them as a playlist.
3613 class SteamIE(InfoExtractor):
# Verbose regex; matched with re.VERBOSE in suitable()/_real_extract().
3614 _VALID_URL = r"""http://store.steampowered.com/
3615 (?P<urltype>video|app)/ #If the page is only for videos or for a game
3617 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
# Overridden because the base-class suitable() does not pass re.VERBOSE.
3621 def suitable(cls, url):
3622 """Receives a URL and returns True if suitable for this IE."""
3623 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
3625 def _real_extract(self, url):
3626 m = re.match(self._VALID_URL, url, re.VERBOSE)
3627 gameID = m.group('gameID')
# Pre-filled age-gate parameters skip the interactive age check.
3628 videourl = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970' % gameID
3629 self.report_age_confirmation()
3630 webpage = self._download_webpage(videourl, gameID)
3631 game_title = re.search(r'<h2 class="pageheader">(?P<game_title>.*?)</h2>', webpage).group('game_title')
# Three parallel scans zipped together: movie JS blobs, title spans,
# thumbnail imgs -- assumed to appear in the same order on the page.
3633 urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
3634 mweb = re.finditer(urlRE, webpage)
3635 namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
3636 titles = re.finditer(namesRE, webpage)
3637 thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
3638 thumbs = re.finditer(thumbsRE, webpage)
3640 for vid,vtitle,thumb in zip(mweb,titles,thumbs):
3641 video_id = vid.group('videoID')
3642 title = vtitle.group('videoName')
3643 video_url = vid.group('videoURL')
3644 video_thumb = thumb.group('thumbnail')
3646 self._downloader.report_error(u'Cannot find video url for %s' % video_id)
3651 'title': unescapeHTML(title),
3652 'thumbnail': video_thumb
3655 return [self.playlist_result(videos, gameID, game_title)]
# NOTE(review): elided listing -- numeric prefixes are original line numbers;
# gaps are missing lines. Code byte-identical; comments only.
#
# UstreamIE: builds the CDN flv URL directly from the recorded-video id and
# scrapes title/uploader from data attributes in the page.
3657 class UstreamIE(InfoExtractor):
3658 _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
3659 IE_NAME = u'ustream'
3661 def _real_extract(self, url):
3662 m = re.match(self._VALID_URL, url)
3663 video_id = m.group('videoID')
# Direct CDN URL -- no page parsing needed for the media itself.
3664 video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
3665 webpage = self._download_webpage(url, video_id)
3666 m = re.search(r'data-title="(?P<title>.+)"',webpage)
3667 title = m.group('title')
3668 m = re.search(r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',webpage)
3669 uploader = m.group('uploader')
3675 'uploader': uploader
# NOTE(review): elided listing -- numeric prefixes are original line numbers;
# gaps are missing lines. Code byte-identical; comments only.
#
# WorldStarHipHopIE: finds a direct mp4/flv link in the page source and
# scrapes title/thumbnail, with special-casing for "candy" pages.
3679 class WorldStarHipHopIE(InfoExtractor):
3680 _VALID_URL = r'http://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
3681 IE_NAME = u'WorldStarHipHop'
3683 def _real_extract(self, url):
# Matches a hosted media URL ending in mp4 or flv.
3684 _src_url = r"""(http://hw-videos.*(?:mp4|flv))"""
3686 webpage_src = compat_urllib_request.urlopen(url).read()
3687 webpage_src = webpage_src.decode('utf-8')
3689 mobj = re.search(_src_url, webpage_src)
3691 m = re.match(self._VALID_URL, url)
3692 video_id = m.group('id')
3694 if mobj is not None:
3695 video_url = mobj.group()
# Extension branch (mp4 vs flv); the assignments are elided here.
3696 if 'mp4' in video_url:
3701 self._downloader.report_error(u'Cannot find video url for %s' % video_id)
3704 _title = r"""<title>(.*)</title>"""
3706 mobj = re.search(_title, webpage_src)
3708 if mobj is not None:
3709 title = mobj.group(1)
# NOTE(review): "World Start Hip Hop" reads like a typo for
# "World Star Hip Hop" (runtime string; not changed here).
3711 title = 'World Start Hip Hop - %s' % time.ctime()
3713 _thumbnail = r"""rel="image_src" href="(.*)" />"""
3714 mobj = re.search(_thumbnail, webpage_src)
3716 # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
3717 if mobj is not None:
3718 thumbnail = mobj.group(1)
# Candy pages keep the title in a candytitles span instead.
3720 _title = r"""candytitles.*>(.*)</span>"""
3721 mobj = re.search(_title, webpage_src)
3722 if mobj is not None:
3723 title = mobj.group(1)
3730 'thumbnail' : thumbnail,
# NOTE(review): elided listing -- numeric prefixes are original line numbers;
# gaps are missing lines. Code byte-identical; comments only.
#
# RBMARadioIE: reads the embedded gon.show JSON blob from the page and maps
# its fields onto the info dict; stream URL is the akamai_url at 256 kbps.
3735 class RBMARadioIE(InfoExtractor):
3736 _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'
3738 def _real_extract(self, url):
3739 m = re.match(self._VALID_URL, url)
3740 video_id = m.group('videoID')
3742 webpage = self._download_webpage(url, video_id)
# Show metadata is assigned to gon.show inside an inline <script>.
3743 m = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
3745 raise ExtractorError(u'Cannot find metadata')
3746 json_data = m.group(1)
3749 data = json.loads(json_data)
3750 except ValueError as e:
3751 raise ExtractorError(u'Invalid JSON: ' + str(e))
# Append the constant bitrate parameter expected by the CDN.
3753 video_url = data['akamai_url'] + '&cbr=256'
3754 url_parts = compat_urllib_parse_urlparse(video_url)
3755 video_ext = url_parts.path.rpartition('.')[2]
3760 'title': data['title'],
# Optional fields use dict.get so missing keys become None.
3761 'description': data.get('teaser_text'),
3762 'location': data.get('country_of_origin'),
3763 'uploader': data.get('host', {}).get('name'),
3764 'uploader_id': data.get('host', {}).get('slug'),
3765 'thumbnail': data.get('image', {}).get('large_url_2x'),
3766 'duration': data.get('duration'),
# NOTE(review): elided listing -- numeric prefixes are original line numbers;
# gaps are missing lines. Code byte-identical; comments only.
#
# YouPornIE: scrapes the page (with an age_verified cookie), builds one
# format entry per download link, and applies the requested format filter.
3771 class YouPornIE(InfoExtractor):
3772 """Information extractor for youporn.com."""
3773 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
# Pretty-print the collected format dicts for --list-formats.
3775 def _print_formats(self, formats):
3776 """Print all available formats"""
3777 print(u'Available formats:')
3778 print(u'ext\t\tformat')
3779 print(u'---------------------------------')
3780 for format in formats:
3781 print(u'%s\t\t%s' % (format['ext'], format['format']))
# Select the single entry whose 'format' equals req_format (body elided).
3783 def _specific(self, req_format, formats):
3785 if(x["format"]==req_format):
3789 def _real_extract(self, url):
3790 mobj = re.match(self._VALID_URL, url)
3792 self._downloader.report_error(u'invalid URL: %s' % url)
3795 video_id = mobj.group('videoid')
# Bypass the age gate by presenting the verification cookie up front.
3797 req = compat_urllib_request.Request(url)
3798 req.add_header('Cookie', 'age_verified=1')
3799 webpage = self._download_webpage(req, video_id)
3801 # Get the video title
3802 result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
3804 raise ExtractorError(u'Unable to extract video title')
3805 video_title = result.group('title').strip()
3807 # Get the video date
3808 result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
# Date and uploader are optional: warn and continue on failure.
3810 self._downloader.report_warning(u'unable to extract video date')
3813 upload_date = result.group('date').strip()
3815 # Get the video uploader
3816 result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
3818 self._downloader.report_warning(u'unable to extract uploader')
3819 video_uploader = None
3821 video_uploader = result.group('uploader').strip()
3822 video_uploader = clean_html( video_uploader )
3824 # Get all of the formats available
3825 DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
3826 result = re.search(DOWNLOAD_LIST_RE, webpage)
3828 raise ExtractorError(u'Unable to extract download list')
3829 download_list_html = result.group('download_list').strip()
3831 # Get all of the links from the page
3832 LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
3833 links = re.findall(LINK_RE, download_list_html)
3834 if(len(links) == 0):
3835 raise ExtractorError(u'ERROR: no known formats available for video')
3837 self.to_screen(u'Links found: %d' % len(links))
3842 # A link looks like this:
3843 # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
3844 # A path looks like this:
3845 # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
3846 video_url = unescapeHTML( link )
3847 path = compat_urllib_parse_urlparse( video_url ).path
3848 extension = os.path.splitext( path )[1][1:]
# Path segment 4 encodes "<size>_<bitrate>_<id>"; keep size and bitrate.
3849 format = path.split('/')[4].split('_')[:2]
3852 format = "-".join( format )
3853 title = u'%s-%s-%s' % (video_title, size, bitrate)
3858 'uploader': video_uploader,
3859 'upload_date': upload_date,
3864 'description': None,
3868 if self._downloader.params.get('listformats', None):
3869 self._print_formats(formats)
3872 req_format = self._downloader.params.get('format', None)
3873 self.to_screen(u'Format: %s' % req_format)
# Format selection: formats are assumed ordered best-first here --
# 'best' branch elided, 'worst' takes the last entry, 'all' returns all.
3875 if req_format is None or req_format == 'best':
3877 elif req_format == 'worst':
3878 return [formats[-1]]
3879 elif req_format in ('-1', 'all'):
3882 format = self._specific( req_format, formats )
3884 self._downloader.report_error(u'requested format not available')
# NOTE(review): elided listing -- numeric prefixes are original line numbers;
# gaps are missing lines. Code byte-identical; comments only.
#
# PornotubeIE: title comes from the URL itself; the flv URL and upload date
# are scraped from the page.
3890 class PornotubeIE(InfoExtractor):
3891 """Information extractor for pornotube.com."""
3892 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
3894 def _real_extract(self, url):
3895 mobj = re.match(self._VALID_URL, url)
3897 self._downloader.report_error(u'invalid URL: %s' % url)
3900 video_id = mobj.group('videoid')
# Title is the trailing path segment of the URL, not page content.
3901 video_title = mobj.group('title')
3903 # Get webpage content
3904 webpage = self._download_webpage(url, video_id)
3907 VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
3908 result = re.search(VIDEO_URL_RE, webpage)
3910 self._downloader.report_error(u'unable to extract video url')
3912 video_url = compat_urllib_parse.unquote(result.group('url'))
3914 #Get the uploaded date
3915 VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
3916 result = re.search(VIDEO_UPLOADED_RE, webpage)
# FIXME(review): failure here is about the upload date, but the message
# says "video title" (runtime string; not changed in this pass).
3918 self._downloader.report_error(u'unable to extract video title')
3920 upload_date = result.group('date')
3922 info = {'id': video_id,
3925 'upload_date': upload_date,
3926 'title': video_title,
# NOTE(review): elided listing -- numeric prefixes are original line numbers;
# gaps are missing lines. Code byte-identical; comments only.
#
# YouJizzIE: finds the embed page referenced by the watch page, then pulls
# the media URL out of the embed page's flash variables.
3932 class YouJizzIE(InfoExtractor):
3933 """Information extractor for youjizz.com."""
3934 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'
3936 def _real_extract(self, url):
3937 mobj = re.match(self._VALID_URL, url)
3939 self._downloader.report_error(u'invalid URL: %s' % url)
3942 video_id = mobj.group('videoid')
3944 # Get webpage content
3945 webpage = self._download_webpage(url, video_id)
3947 # Get the video title
3948 result = re.search(r'<title>(?P<title>.*)</title>', webpage)
3950 raise ExtractorError(u'ERROR: unable to extract video title')
3951 video_title = result.group('title').strip()
3953 # Get the embed page
3954 result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
3956 raise ExtractorError(u'ERROR: unable to extract embed page')
# video_id is replaced by the numeric id from the embed URL.
3958 embed_page_url = result.group(0).strip()
3959 video_id = result.group('videoid')
3961 webpage = self._download_webpage(embed_page_url, video_id)
# The media URL is passed to the flash player via addVariable("file", ...).
3964 result = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
3966 raise ExtractorError(u'ERROR: unable to extract video url')
3967 video_url = result.group('source')
3969 info = {'id': video_id,
3971 'title': video_title,
3974 'player_url': embed_page_url}
# NOTE(review): elided listing -- numeric prefixes are original line numbers;
# gaps are missing lines. Code byte-identical; comments only.
#
# EightTracksIE: reads the PAGE.mix JSON from the playlist page, then walks
# the play/next API track by track under a random session id.
3978 class EightTracksIE(InfoExtractor):
3980 _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'
3982 def _real_extract(self, url):
3983 mobj = re.match(self._VALID_URL, url)
3985 raise ExtractorError(u'Invalid URL: %s' % url)
3986 playlist_id = mobj.group('id')
3988 webpage = self._download_webpage(url, playlist_id)
# Mix metadata is embedded as a JS assignment "PAGE.mix = {...};".
3990 m = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
3992 raise ExtractorError(u'Cannot find trax information')
3993 json_like = m.group(1)
3994 data = json.loads(json_like)
# Random session id: the API tracks playback state per session.
3996 session = str(random.randint(0, 1000000000))
3998 track_count = data['tracks_count']
# NOTE(review): mix_id is referenced here and below but its assignment
# line is elided from this listing -- presumably data['id']; confirm.
3999 first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
4000 next_url = first_url
4002 for i in itertools.count():
4003 api_json = self._download_webpage(next_url, playlist_id,
4004 note=u'Downloading song information %s/%s' % (str(i+1), track_count),
4005 errnote=u'Failed to download song information')
4006 api_data = json.loads(api_json)
4007 track_data = api_data[u'set']['track']
4009 'id': track_data['id'],
4010 'url': track_data['track_file_stream_url'],
4011 'title': track_data['performer'] + u' - ' + track_data['name'],
4012 'raw_title': track_data['name'],
4013 'uploader_id': data['user']['login'],
# Stop when the API flags the last track; otherwise advance the cursor.
4017 if api_data['set']['at_last_track']:
4019 next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
# NOTE(review): elided listing -- numeric prefixes are original line numbers;
# gaps are missing lines. Code byte-identical; comments only.
#
# KeekIE: media and thumbnail URLs are derived directly from the video id;
# only title and uploader are scraped from the page.
4022 class KeekIE(InfoExtractor):
4023 _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
4026 def _real_extract(self, url):
4027 m = re.match(self._VALID_URL, url)
4028 video_id = m.group('videoID')
4029 video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
4030 thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
4031 webpage = self._download_webpage(url, video_id)
4032 m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
4033 title = unescapeHTML(m.group('title'))
# Uploader name is the first <h2> inside the user-name-and-bio block.
4034 m = re.search(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>', webpage)
4035 uploader = clean_html(m.group('uploader'))
4041 'thumbnail': thumbnail,
4042 'uploader': uploader
4046 class TEDIE(InfoExtractor):
4047 _VALID_URL=r'''http://www.ted.com/
4049 ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
4051 ((?P<type_talk>talks)) # We have a simple talk
4053 /(?P<name>\w+) # Here goes the name and then ".html"
4057 def suitable(cls, url):
4058 """Receives a URL and returns True if suitable for this IE."""
4059 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
4061 def _real_extract(self, url):
4062 m=re.match(self._VALID_URL, url, re.VERBOSE)
4063 if m.group('type_talk'):
4064 return [self._talk_info(url)]
4066 playlist_id=m.group('playlist_id')
4067 name=m.group('name')
4068 self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
4069 return [self._playlist_videos_info(url,name,playlist_id)]
4071 def _talk_video_link(self,mediaSlug):
4072 '''Returns the video link for that mediaSlug'''
4073 return 'http://download.ted.com/talks/%s.mp4' % mediaSlug
4075 def _playlist_videos_info(self,url,name,playlist_id=0):
4076 '''Returns the videos of the playlist'''
4078 <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
4079 ([.\s]*?)data-playlist_item_id="(\d+)"
4080 ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
4082 video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
4083 webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
4084 m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
4085 m_names=re.finditer(video_name_RE,webpage)
4087 playlist_RE = r'div class="headline">(\s*?)<h1>(\s*?)<span>(?P<playlist_title>.*?)</span>'
4088 m_playlist = re.search(playlist_RE, webpage)
4089 playlist_title = m_playlist.group('playlist_title')
4091 playlist_entries = []
4092 for m_video, m_name in zip(m_videos,m_names):
4093 video_id=m_video.group('video_id')
4094 talk_url='http://www.ted.com%s' % m_name.group('talk_url')
4095 playlist_entries.append(self.url_result(talk_url, 'TED'))
4096 return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)
    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        m=re.match(self._VALID_URL, url,re.VERBOSE)
        videoName=m.group('name')
        webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
        # If the url includes the language we get the title translated
        title_RE=r'<span id="altHeadline" >(?P<title>.*)</span>'
        title=re.search(title_RE, webpage).group('title')
        # Verbose-mode pattern that scrapes the talkDetails JS object for
        # the numeric video id and the media slug.
        info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
        "id":(?P<videoID>[\d]+).*?
        "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
        thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
        thumb_match=re.search(thumb_RE,webpage)
        info_match=re.search(info_RE,webpage,re.VERBOSE)
        video_id=info_match.group('videoID')
        mediaSlug=info_match.group('mediaSlug')
        video_url=self._talk_video_link(mediaSlug)
        # NOTE(review): the opening of the returned info dict and its
        # id/url/ext/title entries are elided from this excerpt; only the
        # thumbnail entry is visible below.
        'thumbnail': thumb_match.group('thumbnail')
class MySpassIE(InfoExtractor):
    """IE for myspass.de; video metadata comes from an XML endpoint."""
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        # XML metadata endpoint, keyed by the numeric video id.
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'
        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        # NOTE(review): a guard (presumably ``if not video_id:``) is elided
        # from this excerpt before the fallback split below — confirm.
        _, video_id = os.path.split(url_parent_path)
        # Fetch and parse the metadata document.
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))
        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            self._downloader.report_error(u'unable to extract download url')
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            self._downloader.report_error(u'unable to extract title')
        title = title_el.text
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            # NOTE(review): the error-report branch for a missing
            # <format_id> element is elided from this excerpt.
        format = format_id_el.text
        description_el = metadata.find('description')
        if description_el is not None:
            description = description_el.text
        # NOTE(review): default-value branches for description/thumbnail
        # appear to be elided from this excerpt — confirm.
        imagePreview_el = metadata.find('imagePreview')
        if imagePreview_el is not None:
            thumbnail = imagePreview_el.text
        # Visible tail of the returned info dict (its opening and the
        # id/url/title/ext/format entries are elided from this excerpt).
        'thumbnail': thumbnail,
        'description': description
class SpiegelIE(InfoExtractor):
    """IE for spiegel.de videos; stream data comes from a per-video XML."""
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')

        webpage = self._download_webpage(url, video_id)
        m = re.search(r'<div class="spVideoTitle">(.*?)</div>', webpage)
        # NOTE(review): the ``if m is None:`` guard around this raise is
        # elided from this excerpt — confirm against the full source.
        raise ExtractorError(u'Cannot find title')
        video_title = unescapeHTML(m.group(1))

        # The flash XML document lists the available stream variants.
        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(xml_url, video_id,
                    note=u'Downloading XML', errnote=u'Failed to download XML')

        idoc = xml.etree.ElementTree.fromstring(xml_code)
        # The last element of the document is the variant that gets used;
        # presumably the best quality — TODO confirm.
        last_type = idoc[-1]
        filename = last_type.findall('./filename')[0].text
        duration = float(last_type.findall('./duration')[0].text)

        video_url = 'http://video2.spiegel.de/flash/' + filename
        video_ext = filename.rpartition('.')[2]
        # Visible tail of the returned info dict (its opening and the
        # id/url/ext entries are elided from this excerpt).
        'title': video_title,
        'duration': duration,
class LiveLeakIE(InfoExtractor):
    """IE for liveleak.com view pages."""
    _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        # NOTE(review): several ``if <match> is None:`` guards in this
        # method are elided from this excerpt; each report_error call below
        # sits inside such a guard in the full source — confirm.
        mobj = re.match(self._VALID_URL, url)
        self._downloader.report_error(u'invalid URL: %s' % url)

        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        # The direct file URL is embedded in the player configuration.
        m = re.search(r'file: "(.*?)",', webpage)
        self._downloader.report_error(u'unable to find video url')
        video_url = m.group(1)

        # Title comes from the OpenGraph metadata; strip the site prefix.
        m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
        self._downloader.report_error(u'Cannot find video title')
        title = unescapeHTML(m.group('title')).replace('LiveLeak.com -', '').strip()

        # Optional description / uploader, also scraped from the page.
        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        desc = unescapeHTML(m.group('desc'))

        m = re.search(r'By:.*?(\w+)</a>', webpage)
        uploader = clean_html(m.group(1))
        # Visible tail of the returned info dict (its opening and the
        # id/url/ext/title entries are elided from this excerpt).
        'description': desc,
        'uploader': uploader
class ARDIE(InfoExtractor):
    """IE for ARD Mediathek / daserste.de; picks the best media stream."""
    _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
    _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
    _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'

    def _real_extract(self, url):
        # determine video id from url
        m = re.match(self._VALID_URL, url)
        # Prefer an explicit documentId= query parameter over the path id.
        numid = re.search(r'documentId=([0-9]+)', url)
        # NOTE(review): the if/else selecting between the two assignments
        # below is elided from this excerpt — confirm.
        video_id = numid.group(1)
        video_id = m.group('video_id')

        # determine title and media streams from webpage
        html = self._download_webpage(url, video_id)
        title = re.search(self._TITLE, html).group('title')
        streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
        # NOTE(review): an ``if not streams:`` guard appears to be elided
        # here; the "fsk" marker signals an age-restricted video.
        assert '"fsk"' in html
        self._downloader.report_error(u'this video is only available after 8:00 pm')

        # choose default media type and highest quality for now
        stream = max([s for s in streams if int(s["media_type"]) == 0],
                     key=lambda s: int(s["quality"]))

        # there's two possibilities: RTMP stream or HTTP download
        info = {'id': video_id, 'title': title, 'ext': 'mp4'}
        if stream['rtmp_url']:
            self.to_screen(u'RTMP download detected')
            assert stream['video_url'].startswith('mp4:')
            info["url"] = stream["rtmp_url"]
            info["play_path"] = stream['video_url']
        # NOTE(review): the else: header for the HTTP-download branch below
        # is elided from this excerpt.
        assert stream["video_url"].endswith('.mp4')
        info["url"] = stream["video_url"]
class TumblrIE(InfoExtractor):
    """IE for videos embedded in tumblr.com posts."""
    _VALID_URL = r'http://(?P<blog_name>.*?).tumblr.com/((post)|(video))/(?P<id>\d*)/(.*?)'

    def _real_extract(self, url):
        m_url = re.match(self._VALID_URL, url)
        video_id = m_url.group('id')
        blog = m_url.group('blog_name')

        # Normalize to the canonical post URL before downloading.
        url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
        webpage = self._download_webpage(url, video_id)

        # The embedded player escapes quotes as \x22 in the page source.
        re_video = r'src=\\x22(?P<video_url>http://%s.tumblr.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
        video = re.search(re_video, webpage)
        # NOTE(review): the guard (presumably ``if video is None:``) around
        # this message is elided from this excerpt — confirm.
        self.to_screen("No video founded")
        video_url = video.group('video_url')
        ext = video.group('ext')

        re_thumb = r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22' # We pick the first poster
        thumb = re.search(re_thumb, webpage).group('thumb').replace('\\', '')

        # The only place where you can get a title, it's not complete,
        # but searching in other places doesn't work for all videos
        re_title = r'<title>(.*?) - (?P<title>.*?)</title>'
        title = unescapeHTML(re.search(re_title, webpage).group('title'))

        # Visible head of the returned info dict (its remaining entries and
        # closing are elided from this excerpt).
        return [{'id': video_id,
4337 def gen_extractors():
4338 """ Return a list of an instance of every supported extractor.
4339 The order does matter; the first extractor matched is the one handling the URL.
4342 YoutubePlaylistIE(),
4367 StanfordOpenClassroomIE(),
4377 WorldStarHipHopIE(),
def get_info_extractor(ie_name):
    """Look up the info extractor class for *ie_name*.

    The class is resolved from this module's globals by appending the
    'IE' suffix; a KeyError propagates if no such class is defined.
    """
    class_name = ie_name+'IE'
    return globals()[class_name]