2 # -*- coding: utf-8 -*-
4 from __future__ import absolute_import
15 import xml.etree.ElementTree
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    title:          Video title, unescaped.
    ext:            Video filename extension.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The subtitle file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    The fields should all be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    # NOTE(review): this view of the source is line-sampled/truncated. Several
    # guard lines (if/try/else headers), dict entries and return statements are
    # missing below; NOTE(review) comments mark the most confusing gaps rather
    # than guessing at the missing code.

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self.set_downloader(downloader)

    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Takes `cls`, so presumably decorated with @classmethod in the full
        # source -- the decorator line is not visible here; confirm upstream.
        return re.match(cls._VALID_URL, url) is not None

        # NOTE(review): fragment -- the enclosing property/method header for
        # the _WORKING getter is not visible in this view.
        """Getter method for _WORKING."""

        # NOTE(review): fragment -- the enclosing `initialize` method header
        # and its "already initialized?" guard are not visible in this view.
        """Initializes an instance (authentication, etc)."""
        self._real_initialize()

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""

        # NOTE(review): fragment of the IE_NAME property (header not visible).
        # Strips the trailing "IE" from the class name, e.g. YoutubeIE -> Youtube.
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the response handle """
        # NOTE(review): the `if note is None:` header is missing before the
        # next line in this view.
            self.report_download_webpage(video_id)
        elif note is not False:
            # Any other truthy note is shown verbatim, prefixed by the video id.
            self.to_screen(u'%s: %s' % (video_id, note))
        # NOTE(review): `try:` header missing here.
            return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            # NOTE(review): an `if errnote is None:` guard is presumably
            # missing before the default message -- confirm upstream.
            errnote = u'Unable to download webpage'
            # Re-raise as ExtractorError, preserving the original traceback.
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the data of the page as a string """
        urlh = self._request_webpage(url_or_request, video_id, note, errnote)
        content_type = urlh.headers.get('Content-Type', '')
        # Honour the charset declared in the Content-Type header, if any.
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        # NOTE(review): the `if m:` header and the fallback-encoding branch
        # are missing here.
            encoding = m.group(1)
        webpage_bytes = urlh.read()
        if self._downloader.params.get('dump_intermediate_pages', False):
            # NOTE(review): `try:` header missing here.
            url = url_or_request.get_full_url()
            except AttributeError:
            # NOTE(review): the fallback assignment (plain string URL) under
            # AttributeError is missing here.
            self.to_screen(u'Dumping request to ' + url)
            # Base64 keeps binary pages printable on the console.
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        # 'replace' avoids hard failures on pages with broken encodings.
        return webpage_bytes.decode(encoding, 'replace')

    def to_screen(self, msg):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))

    def report_extraction(self, id_or_name):
        """Report information extraction."""
        self.to_screen(u'%s: Extracting information' % id_or_name)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen(u'%s: Downloading webpage' % video_id)

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen(u'Confirming age')

    # Methods for following #608.
    # They set the correct value of the '_type' key.
    def video_result(self, video_info):
        """Returns a video"""
        video_info['_type'] = 'video'
        # NOTE(review): `return video_info` is not visible in this view.

    def url_result(self, url, ie=None):
        """Returns a url that points to a page that should be processed"""
        #TODO: ie should be the class used for getting the info
        video_info = {'_type': 'url',
        # NOTE(review): the remaining dict entries and the return statement
        # are not visible in this view.

    def playlist_result(self, entries, playlist_id=None, playlist_title=None):
        """Returns a playlist"""
        video_info = {'_type': 'playlist',
        # NOTE(review): the dict continuation, the `if` guards around the two
        # optional assignments below, and the return are not visible here.
            video_info['id'] = playlist_id
            video_info['title'] = playlist_title
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # NOTE(review): this view of the source is line-sampled/truncated. Several
    # guard lines (if/try/else headers), dict entries and return statements are
    # missing below; NOTE(review) comments mark the most confusing gaps rather
    # than guessing at the missing code.

    # NOTE(review): fragment of the verbose _VALID_URL pattern -- the
    # `_VALID_URL = r"""` opening line is not visible in this view.
        (?:https?://)? # http(s):// (optional)
        (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
        tube\.majestyc\.net/) # the various hostnames, with wildcard subdomains
        (?:.*?\#/)? # handle anchor (#/) redirect urls
        (?: # the various things that can precede the ID:
            (?:(?:v|embed|e)/) # v/ or embed/ or e/
            |(?: # or the v= param in all its forms
                (?:watch(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                (?:\?|\#!?) # the params delimiter ? or # or #!
                (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
            )? # optional -> youtube.com/xxxx is OK
        )? # all until now is optional -> you can pass the naked ID
        ([0-9A-Za-z_-]+) # here is it! the YouTube video ID
        (?(1).+)? # if we found the ID, everything can follow

    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container extension map; most entries are missing from this view.
    _video_extensions = {
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
    # itag -> "WxH" display map; the entries are missing from this view.
    _video_dimensions = {

    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Playlist URLs also match _VALID_URL; defer them to YoutubePlaylistIE.
        if YoutubePlaylistIE.suitable(url): return False
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self.to_screen(u'Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self.to_screen(u'%s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self.to_screen(u'%s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to list the available subtitles."""
        self.to_screen(u'%s: Checking available subtitles' % video_id)

    def report_video_subtitles_request(self, video_id, sub_lang, format):
        """Report attempt to download a subtitle track."""
        self.to_screen(u'%s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))

    def report_video_subtitles_available(self, video_id, sub_lang_list):
        """Report available subtitles."""
        sub_lang = ",".join(list(sub_lang_list.keys()))
        self.to_screen(u'%s: Available subtitles for video: %s' % (video_id, sub_lang))

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self.to_screen(u'%s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report that the requested format is not available."""
        self.to_screen(u'%s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self.to_screen(u'RTMP download detected')

    def _get_available_subtitles(self, video_id):
        """Fetch the subtitle track list; on failure return an
        (error_message, None) tuple instead of a dict (callers check with
        isinstance(..., tuple))."""
        self.report_video_subtitles_download(video_id)
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
        # NOTE(review): `try:` header missing here.
        sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'unable to download video subtitles: %s' % compat_str(err), None)
        # Build {lang_code: lang_name} from the XML attribute pairs.
        sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
        sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
        if not sub_lang_list:
            return (u'video doesn\'t have subtitles', None)
        # NOTE(review): `return sub_lang_list` is not visible in this view.

    def _list_available_subtitles(self, video_id):
        """Print the available subtitle languages for video_id."""
        sub_lang_list = self._get_available_subtitles(video_id)
        self.report_video_subtitles_available(video_id, sub_lang_list)

    def _request_subtitle(self, sub_lang, sub_name, video_id, format):
        """
        Download one subtitle track and return a tuple:
        (error_message, sub_lang, sub)
        """
        self.report_video_subtitles_request(video_id, sub_lang, format)
        params = compat_urllib_parse.urlencode({
        # NOTE(review): the query parameters (lang/name/v/fmt) are missing
        # from this view.
        url = 'http://www.youtube.com/api/timedtext?' + params
        # NOTE(review): `try:` header missing here.
        sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
        # NOTE(review): an `if not sub:` guard is presumably missing before
        # the empty-result return below -- confirm upstream.
            return (u'Did not fetch video subtitles', None, None)
        return (None, sub_lang, sub)

    def _extract_subtitle(self, video_id):
        """
        Return a list with a single tuple:
        [(error_message, sub_lang, sub)]
        """
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
            return [(sub_lang_list[0], None, None)]
        # Language choice: explicit --sub-lang beats English beats first listed.
        if self._downloader.params.get('subtitleslang', False):
            sub_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in sub_lang_list:
            # NOTE(review): the `sub_lang = 'en'` body and the `else:` header
            # are missing here.
            sub_lang = list(sub_lang_list.keys())[0]
        if not sub_lang in sub_lang_list:
            return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]
        subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
        # NOTE(review): `return [subtitle]` is not visible in this view.

    def _extract_all_subtitles(self, video_id):
        """Download every available subtitle track; return a list of
        (error_message, sub_lang, sub) tuples."""
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
            return [(sub_lang_list[0], None, None)]
        # NOTE(review): the `subtitles = []` initialisation is not visible here.
        for sub_lang in sub_lang_list:
            subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
            subtitles.append(subtitle)
        # NOTE(review): `return subtitles` is not visible in this view.

    def _print_formats(self, formats):
        """Print each format with its extension and dimensions (--list-formats)."""
        print('Available formats:')
        # NOTE(review): the `for x in formats:` header is missing here.
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        """Best-effort session setup: set language, log in (username/password
        or .netrc), then confirm age."""
        if self._downloader is None:
            # NOTE(review): the early `return` body is not visible here.
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            # NOTE(review): `try:` header missing here.
            info = netrc.netrc().authenticators(self._NETRC_MACHINE)
            # NOTE(review): the `if info is not None:` unpacking branch is
            # missing before the raise below.
                raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))

        # Force the English interface so later regexes match.
        request = compat_urllib_request.Request(self._LANG_URL)
        # NOTE(review): the `try:` header and the self.report_lang() call are
        # missing here.
        compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))

        # No authentication to be performed
        # NOTE(review): the `if username is None: return` guard is missing here.

        request = compat_urllib_request.Request(self._LOGIN_URL)
        # NOTE(review): `try:` header missing here.
        login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))

        # Scrape the hidden anti-forgery tokens out of the login form.
        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
        # NOTE(review): the `if match:` guard is missing here.
            galx = match.group(1)

        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
        # NOTE(review): the dsh extraction body and the opening of the
        # login_form_strs dict (with Email/Passwd/GALX/dsh entries) are
        # missing here.
            u'continue': u'http://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            u'PersistentCookie': u'yes',
            u'bgresponse': u'js_disabled',
            u'checkConnection': u'',
            u'checkedDomains': u'youtube',
            u'signIn': u'Sign in',
            u'service': u'youtube',
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # does not handle unicode values properly.
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        # NOTE(review): the `try:` header and the self.report_login() call are
        # missing here.
        login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        # If the login form is still present in the response, login failed.
        if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
            self._downloader.report_warning(u'unable to log in: bad username or password')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))

        # NOTE(review): the opening of the age_form dict is missing here.
            'action_confirm': 'Confirm',
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        # NOTE(review): `try:` header missing here.
        self.report_age_confirmation()
        age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))

    def _extract_id(self, url):
        """Extract the video id from url using _VALID_URL."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        # NOTE(review): the `if mobj is None:` guard is missing here.
            self._downloader.report_error(u'invalid URL: %s' % url)
        video_id = mobj.group(2)
        # NOTE(review): `return video_id` is not visible in this view.

    def _real_extract(self, url):
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        # NOTE(review): the `if mobj:` guard is missing here.
            url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Download the watch page.
        self.report_video_webpage_download(video_id)
        url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
        # NOTE(review): `try:` header missing here.
        video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        # NOTE(review): the `if mobj is not None:` / `else:` lines are missing
        # here.
            # Un-escape the backslash-escaped URL.
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))

        # Query get_video_info, trying several `el` values until one yields a
        # token.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            video_info_webpage = self._download_webpage(video_info_url, video_id,
                    errnote='unable to download video info webpage')
            video_info = compat_parse_qs(video_info_webpage)
            if 'token' in video_info:
                # NOTE(review): the `break` body is not visible here.
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.report_error(u'YouTube said: %s' % video_info['reason'][0])
            # NOTE(review): the `else:` header is missing here.
                self._downloader.report_error(u'"token" parameter not in video info for unknown reason')

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.report_error(u'"rental" videos not supported')

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.report_error(u'unable to extract uploader name')
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id (nickname/channel), optional.
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        # NOTE(review): the `if mobj is not None:` / `else:` headers are
        # missing around the next two lines.
            video_uploader_id = mobj.group(1)
            self._downloader.report_warning(u'unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            self._downloader.report_error(u'unable to extract video title')
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail, optional.
        if 'thumbnail_url' not in video_info:
            self._downloader.report_warning(u'unable to extract video thumbnail')
            # NOTE(review): the fallback assignment is not visible here.
        else: # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date, scraped from the watch page.
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        # NOTE(review): the `upload_date = None` initialisation and the
        # `if mobj is not None:` guard are missing here.
            # Normalise separators to spaces before parsing.
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            upload_date = unified_strdate(upload_date)

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
        # NOTE(review): the `else:` header is missing here; fall back to the
        # meta description tag.
            fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
            # NOTE(review): the `if fd_mobj:` / `else:` headers are missing
            # around the next two lines.
            video_description = unescapeHTML(fd_mobj.group(1))
            video_description = u''

        # subtitles
        video_subtitles = None

        if self._downloader.params.get('writesubtitles', False):
            video_subtitles = self._extract_subtitle(video_id)
            # NOTE(review): the guards around unpacking/reporting below are
            # missing here.
            (sub_error, sub_lang, sub) = video_subtitles[0]
            self._downloader.report_error(sub_error)

        if self._downloader.params.get('allsubtitles', False):
            video_subtitles = self._extract_all_subtitles(video_id)
            for video_subtitle in video_subtitles:
                (sub_error, sub_lang, sub) = video_subtitle
                # NOTE(review): an `if sub_error:` guard is presumably missing
                # before the report below -- confirm upstream.
                self._downloader.report_error(sub_error)

        if self._downloader.params.get('listsubtitles', False):
            sub_lang_list = self._list_available_subtitles(video_id)
            # NOTE(review): the early `return` is not visible here.

        if 'length_seconds' not in video_info:
            self._downloader.report_warning(u'unable to extract video duration')
        # NOTE(review): the fallback assignment and the `else:` header are
        # missing around the next line.
        video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            # RTMP streams carry a single opaque URL and no itag.
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            # Build an itag -> signed URL map from the stream map.
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                # Drop everything better than the requested cap.
                format_list = available_formats[available_formats.index(format_limit):]
            # NOTE(review): the `else:` header is missing here.
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                raise ExtractorError(u'no known formats available for video')
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                # NOTE(review): the early `return` is not visible here.
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            # NOTE(review): the `else:` header is missing here.
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    # NOTE(review): the `if rf in url_map:` guard and the
                    # `break` are missing around the next line.
                    video_url_list = [(rf, url_map[rf])]
                if video_url_list is None:
                    raise ExtractorError(u'requested format not available')
        # NOTE(review): the `else:` header is missing here.
            raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')

        # Emit one result dict per selected format.
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

            # NOTE(review): the `results.append({` opening and the 'id' entry
            # are missing here.
                'url': video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date': upload_date,
                'title': video_title,
                'ext': video_extension,
                'format': video_format,
                'thumbnail': video_thumbnail,
                'description': video_description,
                'player_url': player_url,
                'subtitles': video_subtitles,
                'duration': video_duration
        # NOTE(review): the dict/list close and `return results` are not
        # visible in this view.
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    # NOTE(review): this view of the source is line-sampled/truncated. Several
    # guard lines (if/try/else headers), dict entries and return statements are
    # missing below; NOTE(review) comments mark the most confusing gaps rather
    # than guessing at the missing code.

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self.to_screen(u'Retrieving disclaimer')

    def _real_initialize(self):
        """Fetch the disclaimer page, then POST the family-filter form to
        confirm age (best effort)."""
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        # NOTE(review): `try:` header missing here.
        self.report_disclaimer()
        disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to retrieve disclaimer: %s' % compat_str(err))

        # NOTE(review): the opening of the disclaimer_form dict is missing
        # here.
            'submit': "Continue - I'm over 18",
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
        # NOTE(review): `try:` header missing here.
        self.report_age_confirmation()
        disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))

    def _real_extract(self, url):
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): the `if mobj is None:` guard is missing here.
            self._downloader.report_error(u'invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            # Delegate "yt-XXXX" ids to the YouTube extractor.
            return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1), 'Youtube')]

        # Retrieve video webpage to extract further information
        webpage = self._download_webpage('http://www.metacafe.com/watch/%s/' % video_id, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        # NOTE(review): the `if mobj is not None:` header is missing here;
        # this branch handles the direct mediaURL case.
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            # NOTE(review): the `if mobj is None:` branch (plain URL) is
            # missing before the two lines below.
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        # NOTE(review): the `else:` header (flashvars fallback) is missing
        # here.
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            # NOTE(review): the `if mobj is None:` guard is missing here.
                self._downloader.report_error(u'unable to extract media URL')
            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.report_error(u'unable to extract media URL')
            mobj = re.search(r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
            # NOTE(review): the `if mobj is None:` guard is missing here.
                self._downloader.report_error(u'unable to extract media URL')
            # JSON-style escaped slashes -> plain slashes.
            mediaURL = mobj.group('mediaURL').replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key'))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        # NOTE(review): the `if mobj is None:` guard is missing here.
            self._downloader.report_error(u'unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'submitter=(.*?);', webpage)
        # NOTE(review): the `if mobj is None:` guard is missing here.
            self._downloader.report_error(u'unable to extract uploader nickname')
        video_uploader = mobj.group(1)

        # NOTE(review): the `return [{` opening is missing here.
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
        # NOTE(review): the dict/list close is not visible in this view.
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    # NOTE(review): this view of the source is line-sampled/truncated. Several
    # guard lines (if/else headers), dict entries and return statements are
    # missing below; NOTE(review) comments mark the most confusing gaps rather
    # than guessing at the missing code.

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'

    def _real_extract(self, url):
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): the `if mobj is None:` guard is missing here.
            self._downloader.report_error(u'invalid URL: %s' % url)

        # Strip the "_title-slug" suffix and any query string from the id.
        video_id = mobj.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        # Disable the family filter so age-restricted videos stay reachable.
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'\s*var flashvars = (.*)', webpage)
        # NOTE(review): the `if mobj is None:` guard is missing here.
            self._downloader.report_error(u'unable to extract media URL')
        flashvars = compat_urllib_parse.unquote(mobj.group(1))

        # Probe qualities from best to worst and keep the first present.
        for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
            # NOTE(review): the `if key in flashvars:` guard, the max_quality
            # assignment and the `break` are missing around the next line.
                self.to_screen(u'Using %s' % key)
        # NOTE(review): the for/else (no quality found) header is missing
        # here.
            self._downloader.report_error(u'unable to extract video URL')

        mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        # NOTE(review): the `if mobj is None:` guard is missing here.
            self._downloader.report_error(u'unable to extract video URL')

        # JSON-style escaped slashes -> plain slashes.
        video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        # NOTE(review): the `if mobj is None:` guard is missing here.
            self._downloader.report_error(u'unable to extract title')
        video_title = unescapeHTML(mobj.group('title'))

        video_uploader = None
        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        # NOTE(review): the `if mobj is None:` header is missing here.
            # looking for official user
            mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if mobj_official is None:
                self._downloader.report_warning(u'unable to extract uploader nickname')
            # NOTE(review): the `else:` header is missing here.
                video_uploader = mobj_official.group(1)
        # NOTE(review): the outer `else:` header is missing here.
            video_uploader = mobj.group(1)

        video_upload_date = None
        mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        # NOTE(review): the `if mobj is not None:` guard is missing here.
            # Reassemble DD-MM-YYYY into YYYYMMDD.
            video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)

        # NOTE(review): the `return [{` opening and the 'id'/'url' entries are
        # missing here.
            'uploader': video_uploader,
            'upload_date': video_upload_date,
            'title': video_title,
            'ext': video_extension,
        # NOTE(review): the dict/list close is not visible in this view.
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    # NOTE(review): this view of the source is line-sampled/truncated. Several
    # guard lines (if/try headers), dict entries and return statements are
    # missing below; NOTE(review) comments mark the most confusing gaps rather
    # than guessing at the missing code.

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): the `if mobj is None:` guard is missing here.
            self._downloader.report_error(u'Invalid URL: %s' % url)

        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        # NOTE(review): `try:` header missing here.
        self.report_download_webpage(video_id)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        # NOTE(review): the `if mobj is None:` guard is missing here.
            self._downloader.report_error(u'unable to extract media URL')
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))

        # NOTE(review): video_url is referenced below but its assignment from
        # mediaURL is not visible in this view.

        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        # NOTE(review): the `if mobj is None:` guard is missing here.
            self._downloader.report_error(u'unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        video_uploader = mobj.group(2).decode('utf-8')

        # NOTE(review): the `return [{` opening is missing here.
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader,
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
        # NOTE(review): the dict/list close is not visible in this view.
917 class YahooIE(InfoExtractor):
918 """Information extractor for video.yahoo.com."""
921 # _VALID_URL matches all Yahoo! Video URLs
922 # _VPAGE_URL matches only the extractable '/watch/' URLs
923 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
924 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
925 IE_NAME = u'video.yahoo'
927 def _real_extract(self, url, new_video=True):
928 # Extract ID from URL
929 mobj = re.match(self._VALID_URL, url)
931 self._downloader.report_error(u'Invalid URL: %s' % url)
934 video_id = mobj.group(2)
935 video_extension = 'flv'
937 # Rewrite valid but non-extractable URLs as
938 # extractable English language /watch/ URLs
939 if re.match(self._VPAGE_URL, url) is None:
940 request = compat_urllib_request.Request(url)
942 webpage = compat_urllib_request.urlopen(request).read()
943 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
944 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
947 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
949 self._downloader.report_error(u'Unable to extract id field')
951 yahoo_id = mobj.group(1)
953 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
955 self._downloader.report_error(u'Unable to extract vid field')
957 yahoo_vid = mobj.group(1)
959 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
960 return self._real_extract(url, new_video=False)
962 # Retrieve video webpage to extract further information
963 request = compat_urllib_request.Request(url)
965 self.report_download_webpage(video_id)
966 webpage = compat_urllib_request.urlopen(request).read()
967 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
968 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
971 # Extract uploader and title from webpage
972 self.report_extraction(video_id)
973 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
975 self._downloader.report_error(u'unable to extract video title')
977 video_title = mobj.group(1).decode('utf-8')
979 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
981 self._downloader.report_error(u'unable to extract video uploader')
983 video_uploader = mobj.group(1).decode('utf-8')
985 # Extract video thumbnail
986 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
988 self._downloader.report_error(u'unable to extract video thumbnail')
990 video_thumbnail = mobj.group(1).decode('utf-8')
992 # Extract video description
993 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
995 self._downloader.report_error(u'unable to extract video description')
997 video_description = mobj.group(1).decode('utf-8')
998 if not video_description:
999 video_description = 'No description available.'
1001 # Extract video height and width
1002 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1004 self._downloader.report_error(u'unable to extract video height')
1006 yv_video_height = mobj.group(1)
1008 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1010 self._downloader.report_error(u'unable to extract video width')
1012 yv_video_width = mobj.group(1)
1014 # Retrieve video playlist to extract media URL
1015 # I'm not completely sure what all these options are, but we
1016 # seem to need most of them, otherwise the server sends a 401.
1017 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
1018 yv_bitrate = '700' # according to Wikipedia this is hard-coded
1019 request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1020 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1021 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1023 self.report_download_webpage(video_id)
1024 webpage = compat_urllib_request.urlopen(request).read()
1025 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1026 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
1029 # Extract media URL from playlist XML
1030 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1032 self._downloader.report_error(u'Unable to extract media URL')
1034 video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1035 video_url = unescapeHTML(video_url)
1038 'id': video_id.decode('utf-8'),
1040 'uploader': video_uploader,
1041 'upload_date': None,
1042 'title': video_title,
1043 'ext': video_extension.decode('utf-8'),
1044 'thumbnail': video_thumbnail.decode('utf-8'),
1045 'description': video_description,
1049 class VimeoIE(InfoExtractor):
# Extracts metadata and a direct media URL from vimeo.com watch pages by
# parsing the JSON "config" object embedded in the page HTML.
# NOTE(review): the numbering gaps in this listing indicate elided lines
# (e.g. "if mobj is None:" guards, try/except frames, the final return);
# confirm against the complete source before modifying.
1050 """Information extractor for vimeo.com."""
1052 # _VALID_URL matches Vimeo URLs
1053 _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
1056 def _real_extract(self, url, new_video=True):
1057 # Extract ID from URL
1058 mobj = re.match(self._VALID_URL, url)
1060 self._downloader.report_error(u'Invalid URL: %s' % url)
1063 video_id = mobj.group('id')
# Normalize the URL: force https when no scheme was given, and rewrite
# the play_redirect_hls form to the canonical watch-page URL.
1064 if not mobj.group('proto'):
1065 url = 'https://' + url
1066 if mobj.group('direct_link'):
1067 url = 'https://vimeo.com/' + video_id
1069 # Retrieve video webpage to extract further information
1070 request = compat_urllib_request.Request(url, None, std_headers)
1071 webpage = self._download_webpage(request, video_id)
1073 # Now we begin extracting as much information as we can from what we
1074 # retrieved. First we extract the information common to all extractors,
1075 # and latter we extract those that are Vimeo specific.
1076 self.report_extraction(video_id)
1078 # Extract the config JSON
# The config object is cut out of the page HTML by plain string splitting
# and then parsed as JSON; a split failure lands in the elided error path.
1080 config = webpage.split(' = {config:')[1].split(',assets:')[0]
1081 config = json.loads(config)
1083 if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage):
1084 self._downloader.report_error(u'The author has restricted the access to this video, try with the "--referer" option')
1086 self._downloader.report_error(u'unable to extract info section')
1090 video_title = config["video"]["title"]
1092 # Extract uploader and uploader_id
# uploader_id is the last path component of the owner's profile URL.
1093 video_uploader = config["video"]["owner"]["name"]
1094 video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]
1096 # Extract video thumbnail
1097 video_thumbnail = config["video"]["thumbnail"]
1099 # Extract video description
1100 video_description = get_element_by_attribute("itemprop", "description", webpage)
1101 if video_description: video_description = clean_html(video_description)
1102 else: video_description = u''
1104 # Extract upload date
1105 video_upload_date = None
1106 mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
1107 if mobj is not None:
# Concatenated to the YYYYMMDD format the info dict documents for upload_date.
1108 video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)
1110 # Vimeo specific: extract request signature and timestamp
1111 sig = config['request']['signature']
1112 timestamp = config['request']['timestamp']
1114 # Vimeo specific: extract video codec and quality information
1115 # First consider quality, then codecs, then take everything
1116 # TODO bind to format param
1117 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1118 files = { 'hd': [], 'sd': [], 'other': []}
# Bucket each available codec by quality: hd preferred, then sd, otherwise
# fall back to whatever quality key the config lists first.
1119 for codec_name, codec_extension in codecs:
1120 if codec_name in config["video"]["files"]:
1121 if 'hd' in config["video"]["files"][codec_name]:
1122 files['hd'].append((codec_name, codec_extension, 'hd'))
1123 elif 'sd' in config["video"]["files"][codec_name]:
1124 files['sd'].append((codec_name, codec_extension, 'sd'))
1126 files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))
# Pick the first candidate in quality preference order.
1128 for quality in ('hd', 'sd', 'other'):
1129 if len(files[quality]) > 0:
1130 video_quality = files[quality][0][2]
1131 video_codec = files[quality][0][0]
1132 video_extension = files[quality][0][1]
1133 self.to_screen(u'%s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
1136 self._downloader.report_error(u'no known codec found')
# Build the play_redirect URL Vimeo's player uses to hand out the media file.
1139 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1140 %(video_id, sig, timestamp, video_quality, video_codec.upper())
1145 'uploader': video_uploader,
1146 'uploader_id': video_uploader_id,
1147 'upload_date': video_upload_date,
1148 'title': video_title,
1149 'ext': video_extension,
1150 'thumbnail': video_thumbnail,
1151 'description': video_description,
1155 class ArteTvIE(InfoExtractor):
# Extractor for videos.arte.tv (fr/de). Live pages and "+7" catch-up pages
# are dispatched to separate helpers from _real_extract.
# NOTE(review): numbering gaps indicate elided lines (try frames, guards,
# returns); confirm against the complete source before modifying.
1156 """arte.tv information extractor."""
1158 _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
1159 _LIVE_URL = r'index-[0-9]+\.html$'
1161 IE_NAME = u'arte.tv'
1163 def fetch_webpage(self, url):
# Download a page, routing network failures through the downloader's
# error reporting.
1164 request = compat_urllib_request.Request(url)
1166 self.report_download_webpage(url)
1167 webpage = compat_urllib_request.urlopen(request).read()
1168 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1169 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
1171 except ValueError as err:
1172 self._downloader.report_error(u'Invalid URL: %s' % url)
1176 def grep_webpage(self, url, regex, regexFlags, matchTuples):
# Fetch url, run regex over it, and copy the requested capture groups into
# a dict; matchTuples entries are (group index, dict key, error message).
1177 page = self.fetch_webpage(url)
1178 mobj = re.search(regex, page, regexFlags)
1182 self._downloader.report_error(u'Invalid URL: %s' % url)
1185 for (i, key, err) in matchTuples:
1186 if mobj.group(i) is None:
1187 self._downloader.report_error(err)
1190 info[key] = mobj.group(i)
1194 def extractLiveStream(self, url):
# The language code is taken from a fixed path component of the URL.
1195 video_lang = url.split('/')[-4]
# Step 1: locate the videothek JS file referenced by the page.
1196 info = self.grep_webpage(
1198 r'src="(.*?/videothek_js.*?\.js)',
1201 (1, 'url', u'Invalid URL: %s' % url)
1204 http_host = url.split('/')[2]
1205 next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
# Step 2: pull stream path, SWF player and base URL out of that JS.
1206 info = self.grep_webpage(
1208 r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
1209 '(http://.*?\.swf).*?' +
1213 (1, 'path', u'could not extract video path: %s' % url),
1214 (2, 'player', u'could not extract video player: %s' % url),
1215 (3, 'url', u'could not extract video url: %s' % url)
1218 video_url = u'%s/%s' % (info.get('url'), info.get('path'))
1220 def extractPlus7Stream(self, url):
1221 video_lang = url.split('/')[-3]
# Step 1: the flash param points at a videoref XML document.
1222 info = self.grep_webpage(
1224 r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
1227 (1, 'url', u'Invalid URL: %s' % url)
1230 next_url = compat_urllib_parse.unquote(info.get('url'))
# Step 2: pick the <video> entry for the requested language.
1231 info = self.grep_webpage(
1233 r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
1236 (1, 'url', u'Could not find <video> tag: %s' % url)
1239 next_url = compat_urllib_parse.unquote(info.get('url'))
# Step 3: the final XML carries id, title, date and the HD media URL.
1241 info = self.grep_webpage(
1243 r'<video id="(.*?)".*?>.*?' +
1244 '<name>(.*?)</name>.*?' +
1245 '<dateVideo>(.*?)</dateVideo>.*?' +
1246 '<url quality="hd">(.*?)</url>',
1249 (1, 'id', u'could not extract video id: %s' % url),
1250 (2, 'title', u'could not extract video title: %s' % url),
1251 (3, 'date', u'could not extract video date: %s' % url),
1252 (4, 'url', u'could not extract video url: %s' % url)
1257 'id': info.get('id'),
1258 'url': compat_urllib_parse.unquote(info.get('url')),
1259 'uploader': u'arte.tv',
1260 'upload_date': info.get('date'),
1261 'title': info.get('title').decode('utf-8'),
1267 def _real_extract(self, url):
1268 video_id = url.split('/')[-1]
1269 self.report_extraction(video_id)
# Live pages are recognized by their "index-N.html" basename.
1271 if re.search(self._LIVE_URL, video_id) is not None:
1272 self.extractLiveStream(url)
1275 info = self.extractPlus7Stream(url)
1280 class GenericIE(InfoExtractor):
# Last-resort extractor: first follows URL-shortener style redirects with
# HEAD requests, then scrapes common flash-player URL patterns out of an
# arbitrary page.
# NOTE(review): numbering gaps indicate elided lines (guards, returns,
# constructor keyword arguments); confirm against the complete source.
1281 """Generic last-resort information extractor."""
1284 IE_NAME = u'generic'
1286 def report_download_webpage(self, video_id):
1287 """Report webpage download."""
# Warn that the generic fallback is in use (suppressed under --test).
1288 if not self._downloader.params.get('test', False):
1289 self._downloader.report_warning(u'Falling back on generic information extractor.')
1290 super(GenericIE, self).report_download_webpage(video_id)
1292 def report_following_redirect(self, new_url):
1293 """Report information extraction."""
1294 self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1296 def _test_redirect(self, url):
1297 """Check if it is a redirect, like url shorteners, in case return the new url."""
# HEAD variant of Request so the redirect probe avoids downloading bodies.
1298 class HeadRequest(compat_urllib_request.Request):
1299 def get_method(self):
1302 class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
1304 Subclass the HTTPRedirectHandler to make it use our
1305 HeadRequest also on the redirected URL
1307 def redirect_request(self, req, fp, code, msg, headers, newurl):
1308 if code in (301, 302, 303, 307):
# Escape spaces and drop entity headers before re-issuing as HEAD.
1309 newurl = newurl.replace(' ', '%20')
1310 newheaders = dict((k,v) for k,v in req.headers.items()
1311 if k.lower() not in ("content-length", "content-type"))
1312 return HeadRequest(newurl,
1314 origin_req_host=req.get_origin_req_host(),
1317 raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)
1319 class HTTPMethodFallback(compat_urllib_request.BaseHandler):
1321 Fallback to GET if HEAD is not allowed (405 HTTP error)
1323 def http_error_405(self, req, fp, code, msg, headers):
1327 newheaders = dict((k,v) for k,v in req.headers.items()
1328 if k.lower() not in ("content-length", "content-type"))
1329 return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
1331 origin_req_host=req.get_origin_req_host(),
# Build a bare opener containing only the handlers the HEAD probe needs.
1335 opener = compat_urllib_request.OpenerDirector()
1336 for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
1337 HTTPMethodFallback, HEADRedirectHandler,
1338 compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
1339 opener.add_handler(handler())
1341 response = opener.open(HeadRequest(url))
1342 new_url = response.geturl()
1347 self.report_following_redirect(new_url)
1350 def _real_extract(self, url):
# Delegate to the redirect target's own extractor when one is found.
1351 new_url = self._test_redirect(url)
1352 if new_url: return [self.url_result(new_url)]
1354 video_id = url.split('/')[-1]
1356 webpage = self._download_webpage(url, video_id)
1357 except ValueError as err:
1358 # since this is the last-resort InfoExtractor, if
1359 # this error is thrown, it'll be thrown here
1360 self._downloader.report_error(u'Invalid URL: %s' % url)
1363 self.report_extraction(video_id)
# Progressively broader regexes for a direct http media URL in the page.
1364 # Start with something easy: JW Player in SWFObject
1365 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1367 # Broaden the search a little bit
1368 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1370 # Broaden the search a little bit: JWPlayer JS loader
1371 mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
1373 self._downloader.report_error(u'Invalid URL: %s' % url)
1376 # It's possible that one of the regexes
1377 # matched, but returned an empty group:
1378 if mobj.group(1) is None:
1379 self._downloader.report_error(u'Invalid URL: %s' % url)
1382 video_url = compat_urllib_parse.unquote(mobj.group(1))
1383 video_id = os.path.basename(video_url)
# Derive id and extension from the media URL's basename.
1385 # here's a fun little line of code for you:
1386 video_extension = os.path.splitext(video_id)[1][1:]
1387 video_id = os.path.splitext(video_id)[0]
1389 # it's tempting to parse this further, but you would
1390 # have to take into account all the variations like
1391 # Video Title - Site Name
1392 # Site Name | Video Title
1393 # Video Title - Tagline | Site Name
1394 # and so on and so forth; it's just not practical
1395 mobj = re.search(r'<title>(.*)</title>', webpage)
1397 self._downloader.report_error(u'unable to extract title')
1399 video_title = mobj.group(1)
1401 # video uploader is domain name
1402 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1404 self._downloader.report_error(u'unable to extract title')
1406 video_uploader = mobj.group(1)
1411 'uploader': video_uploader,
1412 'upload_date': None,
1413 'title': video_title,
1414 'ext': video_extension,
1419 class YoutubeSearchIE(InfoExtractor):
# Handles "ytsearchN:query" / "ytsearchall:query" pseudo-URLs by paging
# through the YouTube GData JSON-C API, 50 results per page.
# NOTE(review): numbering gaps indicate elided lines (guards, the int(n)
# parse inside the try, returns); confirm against the complete source.
1420 """Information Extractor for YouTube search queries."""
1421 _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
1422 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1423 _max_youtube_results = 1000
1425 IE_NAME = u'youtube:search'
1427 def report_download_page(self, query, pagenum):
1428 """Report attempt to download search page with given number."""
1429 query = query.decode(preferredencoding())
1430 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1432 def _real_extract(self, query):
1433 mobj = re.match(self._VALID_URL, query)
1435 self._downloader.report_error(u'invalid search query "%s"' % query)
# Split "ytsearchN" prefix from the query text; prefix selects the count.
1438 prefix, query = query.split(':')
1440 query = query.encode('utf-8')
1442 return self._get_n_results(query, 1)
1443 elif prefix == 'all':
1444 self._get_n_results(query, self._max_youtube_results)
# Numeric prefix path: clamp the requested count to the API maximum.
1449 self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
1451 elif n > self._max_youtube_results:
1452 self._downloader.report_warning(u'ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1453 n = self._max_youtube_results
1454 return self._get_n_results(query, n)
1455 except ValueError: # parsing prefix as integer fails
1456 return self._get_n_results(query, 1)
1458 def _get_n_results(self, query, n):
1459 """Get a specified number of results for a query"""
# Page through the API until n results (or the total) are collected.
1465 while (50 * pagenum) < limit:
1466 self.report_download_page(query, pagenum+1)
1467 result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
1468 request = compat_urllib_request.Request(result_url)
1470 data = compat_urllib_request.urlopen(request).read().decode('utf-8')
1471 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1472 self._downloader.report_error(u'unable to download API page: %s' % compat_str(err))
1474 api_response = json.loads(data)['data']
1476 if not 'items' in api_response:
1477 self._downloader.report_error(u'[youtube] No video results')
1480 new_ids = list(video['id'] for video in api_response['items'])
1481 video_ids += new_ids
# totalItems caps the loop when fewer than n results exist.
1483 limit = min(n, api_response['totalItems'])
1486 if len(video_ids) > n:
1487 video_ids = video_ids[:n]
1488 videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids]
1491 class GoogleSearchIE(InfoExtractor):
# Handles "gvsearchN:query" pseudo-URLs by scraping Google Video search
# result pages and queueing each found video on the downloader directly.
# NOTE(review): numbering gaps indicate elided lines (guards, int(n) parse,
# returns); confirm against the complete source before modifying.
1492 """Information Extractor for Google Video search queries."""
1493 _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
1494 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1495 _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
1496 _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
1497 _max_google_results = 1000
1498 IE_NAME = u'video.google:search'
1500 def report_download_page(self, query, pagenum):
1501 """Report attempt to download playlist page with given number."""
1502 query = query.decode(preferredencoding())
1503 self.to_screen(u'query "%s": Downloading page %s' % (query, pagenum))
1505 def _real_extract(self, query):
1506 mobj = re.match(self._VALID_URL, query)
1508 self._downloader.report_error(u'invalid search query "%s"' % query)
# Split "gvsearchN" prefix from the query text; prefix selects the count.
1511 prefix, query = query.split(':')
1513 query = query.encode('utf-8')
1515 self._download_n_results(query, 1)
1517 elif prefix == 'all':
1518 self._download_n_results(query, self._max_google_results)
# Numeric prefix path: clamp the requested count to the scraper maximum.
1524 self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
1526 elif n > self._max_google_results:
1527 self._downloader.report_warning(u'gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1528 n = self._max_google_results
1529 self._download_n_results(query, n)
1531 except ValueError: # parsing prefix as integer fails
1532 self._download_n_results(query, 1)
1535 def _download_n_results(self, query, n):
1536 """Downloads a specified number of results for a query"""
1542 self.report_download_page(query, pagenum)
1543 result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
1544 request = compat_urllib_request.Request(result_url)
1546 page = compat_urllib_request.urlopen(request).read()
1547 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1548 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
1551 # Extract video identifiers
1552 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1553 video_id = mobj.group(1)
1554 if video_id not in video_ids:
1555 video_ids.append(video_id)
# Stop early once n ids are collected and hand them to the downloader.
1556 if len(video_ids) == n:
1557 # Specified n videos reached
1558 for id in video_ids:
1559 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
# No "next" link on the page means the result set is exhausted.
1562 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1563 for id in video_ids:
1564 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1567 pagenum = pagenum + 1
1570 class YahooSearchIE(InfoExtractor):
# Handles "yvsearchN:query" pseudo-URLs by scraping Yahoo! Video search
# result pages; structurally parallel to GoogleSearchIE above.
# NOTE(review): numbering gaps indicate elided lines (guards, int(n) parse,
# returns); confirm against the complete source before modifying.
1571 """Information Extractor for Yahoo! Video search queries."""
1574 _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
1575 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1576 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1577 _MORE_PAGES_INDICATOR = r'\s*Next'
1578 _max_yahoo_results = 1000
1579 IE_NAME = u'video.yahoo:search'
1581 def report_download_page(self, query, pagenum):
1582 """Report attempt to download playlist page with given number."""
1583 query = query.decode(preferredencoding())
1584 self.to_screen(u'query "%s": Downloading page %s' % (query, pagenum))
1586 def _real_extract(self, query):
1587 mobj = re.match(self._VALID_URL, query)
1589 self._downloader.report_error(u'invalid search query "%s"' % query)
# Split "yvsearchN" prefix from the query text; prefix selects the count.
1592 prefix, query = query.split(':')
1594 query = query.encode('utf-8')
1596 self._download_n_results(query, 1)
1598 elif prefix == 'all':
1599 self._download_n_results(query, self._max_yahoo_results)
# Numeric prefix path: clamp the requested count to the scraper maximum.
1605 self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
1607 elif n > self._max_yahoo_results:
1608 self._downloader.report_warning(u'yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1609 n = self._max_yahoo_results
1610 self._download_n_results(query, n)
1612 except ValueError: # parsing prefix as integer fails
1613 self._download_n_results(query, 1)
1616 def _download_n_results(self, query, n):
1617 """Downloads a specified number of results for a query"""
# already_seen deduplicates ids across result pages.
1620 already_seen = set()
1624 self.report_download_page(query, pagenum)
1625 result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
1626 request = compat_urllib_request.Request(result_url)
1628 page = compat_urllib_request.urlopen(request).read()
1629 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1630 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
1633 # Extract video identifiers
1634 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1635 video_id = mobj.group(1)
1636 if video_id not in already_seen:
1637 video_ids.append(video_id)
1638 already_seen.add(video_id)
# Stop early once n ids are collected and hand them to the downloader.
1639 if len(video_ids) == n:
1640 # Specified n videos reached
1641 for id in video_ids:
1642 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
# No "Next" link on the page means the result set is exhausted.
1645 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1646 for id in video_ids:
1647 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1650 pagenum = pagenum + 1
1652 class YoutubePlaylistIE(InfoExtractor):
# Resolves a YouTube playlist URL to its videos via the GData playlists
# feed, paging _MAX_RESULTS entries at a time, and returns a playlist
# result of individual watch-page URLs.
# NOTE(review): numbering gaps indicate elided lines (the verbose-regex
# body, loop header, break/return); confirm against the complete source.
1653 """Information Extractor for YouTube playlists."""
1655 _VALID_URL = r"""(?:
1660 (?:course|view_play_list|my_playlists|artist|playlist|watch)
1661 \? (?:.*?&)*? (?:p|a|list)=
1664 ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
1667 ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
1669 _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
1671 IE_NAME = u'youtube:playlist'
1674 def suitable(cls, url):
1675 """Receives a URL and returns True if suitable for this IE."""
# Overridden because _VALID_URL is written with re.VERBOSE whitespace.
1676 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
1678 def _real_extract(self, url):
1679 # Extract playlist id
1680 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
1682 self._downloader.report_error(u'invalid url: %s' % url)
1685 # Download playlist videos from API
# Either capture group may have matched, depending on the URL form.
1686 playlist_id = mobj.group(1) or mobj.group(2)
1691 url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
1692 page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)
1695 response = json.loads(page)
1696 except ValueError as err:
1697 self._downloader.report_error(u'Invalid JSON in API response: ' + compat_str(err))
1700 if 'feed' not in response:
1701 self._downloader.report_error(u'Got a malformed response from YouTube API')
1703 playlist_title = response['feed']['title']['$t']
1704 if 'entry' not in response['feed']:
1705 # Number of videos is a multiple of self._MAX_RESULTS
# Collect (position, watch URL) pairs so ordering can be restored below.
1708 videos += [ (entry['yt$position']['$t'], entry['content']['src'])
1709 for entry in response['feed']['entry']
1710 if 'content' in entry ]
# A short page means the feed is exhausted.
1712 if len(response['feed']['entry']) < self._MAX_RESULTS:
# Sort by playlist position, then keep only the URLs.
1716 videos = [v[1] for v in sorted(videos)]
1718 url_results = [self.url_result(url, 'Youtube') for url in videos]
1719 return [self.playlist_result(url_results, playlist_id, playlist_title)]
1722 class YoutubeChannelIE(InfoExtractor):
# Lists all videos of a YouTube channel: the first page comes from the
# regular channel page, subsequent pages from the JSON channel_ajax
# endpoint, until the load-more marker disappears.
# NOTE(review): numbering gaps indicate elided lines (guards, loop/break
# structure); confirm against the complete source before modifying.
1723 """Information Extractor for YouTube channels."""
1725 _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
1726 _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
1727 _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
1728 _MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
1729 IE_NAME = u'youtube:channel'
1731 def extract_videos_from_page(self, page):
# Scrape watch-page video ids out of HTML, preserving first-seen order.
1733 for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
1734 if mobj.group(1) not in ids_in_page:
1735 ids_in_page.append(mobj.group(1))
1738 def _real_extract(self, url):
1739 # Extract channel id
1740 mobj = re.match(self._VALID_URL, url)
1742 self._downloader.report_error(u'invalid url: %s' % url)
1745 # Download channel page
1746 channel_id = mobj.group(1)
1750 url = self._TEMPLATE_URL % (channel_id, pagenum)
1751 page = self._download_webpage(url, channel_id,
1752 u'Downloading page #%s' % pagenum)
1754 # Extract video identifiers
1755 ids_in_page = self.extract_videos_from_page(page)
1756 video_ids.extend(ids_in_page)
1758 # Download any subsequent channel pages using the json-based channel_ajax query
1759 if self._MORE_PAGES_INDICATOR in page:
1761 pagenum = pagenum + 1
1763 url = self._MORE_PAGES_URL % (pagenum, channel_id)
1764 page = self._download_webpage(url, channel_id,
1765 u'Downloading page #%s' % pagenum)
# Ajax responses are JSON; the HTML fragment lives in content_html.
1767 page = json.loads(page)
1769 ids_in_page = self.extract_videos_from_page(page['content_html'])
1770 video_ids.extend(ids_in_page)
# The load-more widget disappears on the last page.
1772 if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
1775 self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))
1777 urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids]
1778 url_entries = [self.url_result(url, 'Youtube') for url in urls]
1779 return [self.playlist_result(url_entries, channel_id)]
1782 class YoutubeUserIE(InfoExtractor):
# Lists all uploads of a YouTube user by paging the GData uploads feed
# _GDATA_PAGE_SIZE ids at a time.
# NOTE(review): numbering gaps indicate elided lines (guards, loop header,
# break); confirm against the complete source before modifying.
1783 """Information Extractor for YouTube users."""
1785 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1786 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1787 _GDATA_PAGE_SIZE = 50
1788 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
1789 _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
1790 IE_NAME = u'youtube:user'
1792 def _real_extract(self, url):
1794 mobj = re.match(self._VALID_URL, url)
1796 self._downloader.report_error(u'invalid url: %s' % url)
1799 username = mobj.group(1)
1801 # Download video ids using YouTube Data API. Result size per
1802 # query is limited (currently to 50 videos) so we need to query
1803 # page by page until there are no video ids - it means we got
# GData start-index is 1-based.
1810 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1812 gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
1813 page = self._download_webpage(gdata_url, username,
1814 u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))
1816 # Extract video identifiers
# Scrape ids out of the feed XML, deduplicating within the page.
1819 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1820 if mobj.group(1) not in ids_in_page:
1821 ids_in_page.append(mobj.group(1))
1823 video_ids.extend(ids_in_page)
1825 # A little optimization - if current page is not
1826 # "full", ie. does not contain PAGE_SIZE video ids then
1827 # we can assume that this page is the last one - there
1828 # are no more ids on further pages - no need to query
1831 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
1836 urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
1837 url_results = [self.url_result(url, 'Youtube') for url in urls]
1838 return [self.playlist_result(url_results, playlist_title = username)]
1841 class BlipTVUserIE(InfoExtractor):
# Lists all episodes of a blip.tv user via the mobile-site Ajax endpoint,
# paging until a short page signals the end.
# NOTE(review): numbering gaps indicate elided lines (guards, loop header,
# break, _PAGE_SIZE definition); confirm against the complete source.
1842 """Information Extractor for blip.tv users."""
1844 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
1846 IE_NAME = u'blip.tv:user'
1848 def _real_extract(self, url):
1850 mobj = re.match(self._VALID_URL, url)
1852 self._downloader.report_error(u'invalid url: %s' % url)
1855 username = mobj.group(1)
1857 page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
# The numeric users_id for the Ajax endpoint is scraped from the user page.
1859 page = self._download_webpage(url, username, u'Downloading user page')
1860 mobj = re.search(r'data-users-id="([^"]+)"', page)
1861 page_base = page_base % mobj.group(1)
1864 # Download video ids using BlipTV Ajax calls. Result size per
1865 # query is limited (currently to 12 videos) so we need to query
1866 # page by page until there are no video ids - it means we got
1873 url = page_base + "&page=" + str(pagenum)
1874 page = self._download_webpage(url, username,
1875 u'Downloading video ids from page %d' % pagenum)
1877 # Extract video identifiers
# hrefs in the episode list are relative video paths; unescape entities.
1880 for mobj in re.finditer(r'href="/([^"]+)"', page):
1881 if mobj.group(1) not in ids_in_page:
1882 ids_in_page.append(unescapeHTML(mobj.group(1)))
1884 video_ids.extend(ids_in_page)
1886 # A little optimization - if current page is not
1887 # "full", ie. does not contain PAGE_SIZE video ids then
1888 # we can assume that this page is the last one - there
1889 # are no more ids on further pages - no need to query
1892 if len(ids_in_page) < self._PAGE_SIZE:
1897 urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
1898 url_entries = [self.url_result(url, 'BlipTV') for url in urls]
1899 return [self.playlist_result(url_entries, playlist_title = username)]
1902 class DepositFilesIE(InfoExtractor):
# Extractor for depositfiles.com file pages: POSTs the "Free download"
# form and scrapes the real fileshare URL (or the restriction message).
# NOTE(review): numbering gaps indicate elided lines (try frames, guards,
# the final return); confirm against the complete source before modifying.
1903 """Information extractor for depositfiles.com"""
1905 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
1907 def _real_extract(self, url):
1908 file_id = url.split('/')[-1]
1909 # Rebuild url in english locale
1910 url = 'http://depositfiles.com/en/files/' + file_id
1912 # Retrieve file webpage with 'Free download' button pressed
# gateway_result=1 is the form field the site's free-download button posts.
1913 free_download_indication = { 'gateway_result' : '1' }
1914 request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
1916 self.report_download_webpage(file_id)
1917 webpage = compat_urllib_request.urlopen(request).read()
1918 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1919 self._downloader.report_error(u'Unable to retrieve file webpage: %s' % compat_str(err))
1922 # Search for the real file URL
1923 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
1924 if (mobj is None) or (mobj.group(1) is None):
1925 # Try to figure out reason of the error.
# The site explains restrictions in an <strong>Attention...</strong> blurb.
1926 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
1927 if (mobj is not None) and (mobj.group(1) is not None):
1928 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
1929 self._downloader.report_error(u'%s' % restriction_message)
1931 self._downloader.report_error(u'unable to extract download URL from: %s' % url)
1934 file_url = mobj.group(1)
1935 file_extension = os.path.splitext(file_url)[1][1:]
1937 # Search for file title
1938 mobj = re.search(r'<b title="(.*?)">', webpage)
1940 self._downloader.report_error(u'unable to extract title')
1942 file_title = mobj.group(1).decode('utf-8')
1945 'id': file_id.decode('utf-8'),
1946 'url': file_url.decode('utf-8'),
1948 'upload_date': None,
1949 'title': file_title,
1950 'ext': file_extension.decode('utf-8'),
1954 class FacebookIE(InfoExtractor):
# Extractor for Facebook videos. Optionally logs in (credentials from
# options or .netrc) in _real_initialize, then parses the swf parameter
# JSON out of the video page for the hd/sd media URLs.
# NOTE(review): numbering gaps indicate elided lines (guards, returns,
# login_form construction); confirm against the complete source.
1955 """Information Extractor for Facebook"""
1957 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
1958 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
1959 _NETRC_MACHINE = 'facebook'
1960 IE_NAME = u'facebook'
1962 def report_login(self):
1963 """Report attempt to log in."""
1964 self.to_screen(u'Logging in')
1966 def _real_initialize(self):
1967 if self._downloader is None:
1972 downloader_params = self._downloader.params
1974 # Attempt to use provided username and password or .netrc data
1975 if downloader_params.get('username', None) is not None:
1976 useremail = downloader_params['username']
1977 password = downloader_params['password']
1978 elif downloader_params.get('usenetrc', False):
1980 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
1981 if info is not None:
1985 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
1986 except (IOError, netrc.NetrcParseError) as err:
# .netrc problems only warn; extraction proceeds without login.
1987 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
1990 if useremail is None:
1999 request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
2002 login_results = compat_urllib_request.urlopen(request).read()
# A login form in the response means authentication did not succeed.
2003 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2004 self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2006 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2007 self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
2010 def _real_extract(self, url):
2011 mobj = re.match(self._VALID_URL, url)
2013 self._downloader.report_error(u'invalid URL: %s' % url)
2015 video_id = mobj.group('ID')
2017 url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
2018 webpage = self._download_webpage(url, video_id)
# The swf setup code brackets a JSON array of [param, value] pairs;
# cut it out by its literal surroundings and parse it.
2020 BEFORE = '{swf.addParam(param[0], param[1]);});\n'
2021 AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
2022 m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
2024 raise ExtractorError(u'Cannot parse data')
2025 data = dict(json.loads(m.group(1)))
2026 params_raw = compat_urllib_parse.unquote(data['params'])
2027 params = json.loads(params_raw)
2028 video_data = params['video_data'][0]
# Prefer the HD source; fall back to SD, else fail.
2029 video_url = video_data.get('hd_src')
2031 video_url = video_data['sd_src']
2033 raise ExtractorError(u'Cannot find video URL')
2034 video_duration = int(video_data['video_duration'])
2035 thumbnail = video_data['thumbnail_src']
2037 m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
2039 raise ExtractorError(u'Cannot find title in webpage')
2040 video_title = unescapeHTML(m.group(1))
2044 'title': video_title,
2047 'duration': video_duration,
2048 'thumbnail': thumbnail,
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv.

    Fetches the per-video JSON description (served when the iTunes
    User-Agent is sent) and reads the media URL, metadata and upload
    date from it; direct video responses are downloaded as-is.

    NOTE(review): this source view is elided; several guard/try/return
    lines between the visible statements are not shown here.
    """

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    # Extracts the filename extension from a media URL.
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_direct_download(self, title):
        """Report information extraction."""
        self.to_screen(u'%s: Direct download detected' % title)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        self._downloader.report_error(u'invalid URL: %s' % url)

        # /play/ URLs redirect to a player whose URL fragment carries the
        # real file id; resolve it and recurse on the canonical URL.
        urlp = compat_urllib_parse_urlparse(url)
        if urlp.path.startswith('/play/'):
            request = compat_urllib_request.Request(url)
            response = compat_urllib_request.urlopen(request)
            redirecturl = response.geturl()
            rurlp = compat_urllib_parse_urlparse(redirecturl)
            file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
            url = 'http://blip.tv/a/a-' + file_id
            return self._real_extract(url)

        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        # blip.tv serves JSON (instead of HTML) to the iTunes user agent.
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        urlh = compat_urllib_request.urlopen(request)
        if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
            basename = url.split('/')[-1]
            title,ext = os.path.splitext(basename)
            title = title.decode('UTF-8')
            ext = ext.replace('.', '')
            self.report_direct_download(title)
            'upload_date': None,
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
            json_code_bytes = urlh.read()
            json_code = json_code_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to read video info webpage: %s' % compat_str(err))

            json_data = json.loads(json_code)
            if 'Post' in json_data:
                data = json_data['Post']

            # 'datestamp' looks like "04-26-12 06:48PM"; convert to YYYYMMDD.
            upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
            video_url = data['media']['url']
            umobj = re.match(self._URL_EXT, video_url)
            raise ValueError('Can not determine filename extension')
            ext = umobj.group(1)

            'id': data['item_id'],
            'uploader': data['display_name'],
            'upload_date': upload_date,
            'title': data['title'],
            'format': data['media']['mimeType'],
            'thumbnail': data['thumbnailUrl'],
            'description': data['description'],
            'player_url': data['embedUrl'],
            # Keep the iTunes UA for the media download as well.
            'user_agent': 'iTunes/10.6.1',
        except (ValueError,KeyError) as err:
            self._downloader.report_error(u'unable to parse video information: %s' % repr(err))
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de.

    Derives the direct .flv URL from the thumbnail <link rel='image_src'>
    tag: the thumbnail URL's movie-path prefix plus "/<id>.flv" is the
    media URL.
    """

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # BUG FIX: was `self._download.report_error` (no such
            # attribute, AttributeError on every invalid URL); the
            # downloader is stored as `self._downloader`.
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\'',
                         webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        # Thumbnail path prefix + "/<id>.flv" is the direct media URL.
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id':          video_id,
            'url':         video_url,
            'uploader':    None,
            'upload_date': None,
            'title':       video_title,
            'ext':         u'flv',
        }]
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report.

    Resolves show/episode/clip URLs to an MTV media URI, downloads the
    MRSS index for that URI, and yields one info dict per episode part.

    NOTE(review): this source view is elided; several guard/try/return
    lines between the visible statements are not shown here.
    """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                     |(https?://)?(www\.)?
                     (?P<showname>thedailyshow|colbertnation)\.com/
                     (full-episodes/(?P<episode>.*)|
                     (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                     |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))

    # Bitrates known to be offered, lowest quality last.
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    _video_extensions = {
    _video_dimensions = {

    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden from the base class because _VALID_URL is written
        # with re.VERBOSE whitespace/comments.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _print_formats(self, formats):
        # Print the available format codes with extension and dimensions.
        print('Available formats:')
        print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        self._downloader.report_error(u'invalid URL: %s' % url)

        if mobj.group('shortname'):
            # Expand ":tds"-style abbreviations to the show's episode index.
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
                epTitle = mobj.group('cntitle')
            dlNewest = not mobj.group('episode')
            epTitle = mobj.group('showname')
            epTitle = mobj.group('episode')

        self.report_extraction(epTitle)
        webpage = self._download_webpage(url, epTitle)
        # The site may redirect; re-match against the final URL.
        url = htmlHandle.geturl()
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        self._downloader.report_error(u'Invalid redirected URL: ' + url)
        if mobj.group('episode') == '':
            self._downloader.report_error(u'Redirected URL is still not specific: ' + url)
        epTitle = mobj.group('episode')

        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.
            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                self._downloader.report_error(u'unable to find Flash URL in webpage ' + url)
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        uri = mMovieParams[0][1]
        # MRSS feed listing every part of the episode.
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        indexXml = self._download_webpage(indexUrl, epTitle,
                                          u'Downloading show index',
                                          u'unable to download episode index')

        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for partNum,itemEl in enumerate(itemEls):
            # <guid> is like "...:<show>.com:<mediaId>"
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = unified_strdate(itemEl.findall('./pubDate')[0].text)

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                         compat_urllib_parse.urlencode({'uri': mediaId}))
            configXml = self._download_webpage(configUrl, epTitle,
                                               u'Downloading configuration for %s' % shortMediaId)

            cdoc = xml.etree.ElementTree.fromstring(configXml)
            # Collect (bitrate, rtmp-url) pairs for each rendition.
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)

                self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])

            # For now, just pick the highest bitrate
            format,rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            format, rtmp_video_url = f, v

            # Rewrite the rtmp URL to the equivalent plain-HTTP mirror.
            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
            'upload_date': officialDate,
            'description': officialTitle,
            results.append(info)
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist.

    Reads page metadata from <meta> tags and the player configuration
    (JavaScript masquerading as JSON) referenced by the og:video URL.

    NOTE(review): this source view is elided; guard/try/return lines
    between the visible statements are not shown here.
    """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        self._downloader.report_error(u'invalid URL: %s' % url)
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        webPage = self._download_webpage(url, showName)

        # Description, thumbnail and player URL all come from <meta> tags.
        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
        description = unescapeHTML(descMatch.group(1))
        imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
        imgUrl = unescapeHTML(imgMatch.group(1))
        playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
        playerUrl = unescapeHTML(playerUrlMatch.group(1))
        # The config file URL is percent-encoded in the player URL's
        # "config" query parameter.
        configUrlMatch = re.search('config=(.*)$', playerUrl)
        configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))

        configJSON = self._download_webpage(configUrl, showName,
                                            u'Downloading configuration',
                                            u'unable to download configuration')

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

        config = json.loads(configJSON)
        except (ValueError,) as err:
            self._downloader.report_error(u'Invalid JSON in configuration file: ' + compat_str(err))

        playlist = config['playlist']
        # The second playlist entry holds the actual video URL.
        videoUrl = playlist[1]['url']

        'uploader': showName,
        'upload_date': None,
        'thumbnail': imgUrl,
        'description': description,
        'player_url': playerUrl,
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com.

    Downloads the moogaloop metadata XML for the video id, then the
    Adobe F4M manifest it references, and builds the final segment URL
    from the manifest's media node.

    NOTE(review): this source view is elided; guard/try/return lines
    between the visible statements are not shown here.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Downloading XML manifest' % video_id)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        self._downloader.report_error(u'invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        'upload_date': None,

        self.report_extraction(video_id)
        # Per-video metadata XML (title, description, manifest URL).
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        videoNode = mdoc.findall('./video')[0]
        info['description'] = videoNode.findall('./description')[0].text
        info['title'] = videoNode.findall('./caption')[0].text
        info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
        manifest_url = videoNode.findall('./file')[0].text
        self._downloader.report_error(u'Invalid metadata XML file')

        # hdcore parameter is required by the F4M (HDS) server.
        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))

        # The manifest uses the Adobe F4M namespace.
        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
        node_id = media_node.attrib['url']
        video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError as err:
            self._downloader.report_error(u'Invalid manifest file')

        # Compose the segment URL from the manifest host and media node.
        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com.

    Scrapes the flv_url page variable, the <title> tag and the
    thumbnail URL directly from the watch page.

    NOTE(review): this source view is elided; guard/return lines
    between the visible statements are not shown here.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        self._downloader.report_error(u'invalid URL: %s' % url)
        video_id = mobj.group(1)

        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        # Video URL is percent-encoded in the "flv_url" page variable.
        mobj = re.search(r'flv_url=(.+?)&', webpage)
        self._downloader.report_error(u'unable to extract video url')
        video_url = compat_urllib_parse.unquote(mobj.group(1))

        # Title: <title> contents up to the " - XVID..." suffix.
        mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        self._downloader.report_error(u'unable to extract video title')
        video_title = mobj.group(1)

        # Extract video thumbnail
        mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        self._downloader.report_error(u'unable to extract video thumbnail')
        # group(0): the whole matched URL is the thumbnail address.
        video_thumbnail = mobj.group(0)

        'upload_date': None,
        'title': video_title,
        'thumbnail': video_thumbnail,
        'description': None,
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
    To access the media, the uid of the song and a stream token
    must be extracted from the page source and the script must make
    a request to media.soundcloud.com/crossdomain.xml. Then
    the media can be grabbed by requesting from an url composed
    of the stream token and uid

    NOTE(review): this source view is elided; guard/return lines
    between the visible statements are not shown here.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def report_resolve(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        self._downloader.report_error(u'invalid URL: %s' % url)

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract simple title (uploader + slug of song title)
        slug_title = mobj.group(2)
        simple_title = uploader + u'-' + slug_title
        full_title = '%s/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        # Resolve the permalink to a numeric track id via the public API.
        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        info_json = self._download_webpage(resolv_url, full_title, u'Downloading info JSON')

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction(full_title)

        # Fetch the stream definitions and use the 128k MP3 stream.
        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        stream_json = self._download_webpage(streams_url, full_title,
                                             u'Downloading stream definitions',
                                             u'unable to download stream definitions')

        streams = json.loads(stream_json)
        mediaURL = streams['http_mp3_128_url']
        upload_date = unified_strdate(info['created_at'])

        'uploader': info['user']['username'],
        'upload_date': upload_date,
        'title': info['title'],
        'description': info['description'],
class SoundcloudSetIE(InfoExtractor):
    """Information extractor for soundcloud.com sets
    To access the media, the uid of the song and a stream token
    must be extracted from the page source and the script must make
    a request to media.soundcloud.com/crossdomain.xml. Then
    the media can be grabbed by requesting from an url composed
    of the stream token and uid

    NOTE(review): this source view is elided; guard/return lines
    between the visible statements are not shown here.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
    IE_NAME = u'soundcloud:set'

    def report_resolve(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        self._downloader.report_error(u'invalid URL: %s' % url)

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract simple title (uploader + slug of song title)
        slug_title = mobj.group(2)
        simple_title = uploader + u'-' + slug_title
        full_title = '%s/sets/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        # Resolve the set permalink to its track list via the public API.
        url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        info_json = self._download_webpage(resolv_url, full_title)

        info = json.loads(info_json)
        # API signals problems through an 'errors' list.
        if 'errors' in info:
            for err in info['errors']:
                self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))

        self.report_extraction(full_title)
        # One info dict per track in the set.
        for track in info['tracks']:
            video_id = track['id']

            streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
            stream_json = self._download_webpage(streams_url, video_id, u'Downloading track info JSON')

            self.report_extraction(video_id)
            streams = json.loads(stream_json)
            mediaURL = streams['http_mp3_128_url']

            'uploader': track['user']['username'],
            'upload_date': unified_strdate(track['created_at']),
            'title': track['title'],
            'description': track['description'],
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com.

    The real media id is base64-encoded in the page's "jsclassref"
    JavaScript variable; decoding it yields the rtmpe path.

    NOTE(review): this source view is elided; guard/return lines
    between the visible statements are not shown here.
    """
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        self._downloader.report_error(u'invalid URL: %s' % url)

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # The media reference is base64-encoded in "jsclassref".
        mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
        self._downloader.report_error(u'unable to extract video url')
        real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        # Title lives in the "contentTitle" JS variable.
        mobj = re.search(r'contentTitle = "(.*?)";', webpage)
        self._downloader.report_error(u'unable to extract video title')
        video_title = mobj.group(1)

        # Extract description
        video_description = u'No description available.'
        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if mobj is not None:
            video_description = mobj.group(1)

        # Derive id and extension from the media filename.
        video_filename = video_url.split('/')[-1]
        video_id, extension = video_filename.split('.')

        'upload_date': None,
        'title': video_title,
        'ext': extension, # Extension is always(?) mp4, but seems to be flv
        'description': video_description,
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com.

    Uses the old /api/1/cloudcast/ JSON endpoint to list audio formats
    and picks the first reachable URL for the requested format.

    NOTE(review): marked broken (_WORKING = False); this source view is
    also elided — guard/try/return lines are not all shown here.
    """

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def report_download_json(self, file_id):
        """Report JSON download."""
        self.to_screen(u'Downloading json')

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json"""
        bitrate_list = jsonData[fmt]
        if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
            bitrate = max(bitrate_list) # select highest

        url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        # Probe each candidate URL; network errors mean "try the next".
        for url in url_list:
            compat_urllib_request.urlopen(url)
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:

    def _print_formats(self, formats):
        # List every format/bitrate pair with its file extension.
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                ext = formats[fmt][b][0]
                print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
            except TypeError: # we have no bitrate info
                ext = formats[fmt][0]
                print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        self._downloader.report_error(u'invalid URL: %s' % url)

        # extract uploader & filename from url
        uploader = mobj.group(1).decode('utf-8')
        file_id = uploader + "-" + mobj.group(2).decode('utf-8')

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        self.report_download_json(file_url)
        jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve file: %s' % compat_str(err))

        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)

        if req_format is None or req_format == 'best':
            # Try each format until one has a reachable URL.
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                file_url = self.check_urls(url_list)
                if file_url is not None:
            if req_format not in formats:
                self._downloader.report_error(u'format is not available')

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        'id': file_id.decode('utf-8'),
        'url': file_url.decode('utf-8'),
        'uploader': uploader.decode('utf-8'),
        'upload_date': None,
        'title': json_data['name'],
        'ext': file_url.split('.')[-1].decode('utf-8'),
        'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
        'thumbnail': json_data['thumbnail_url'],
        'description': json_data['description'],
        'player_url': player_url.decode('utf-8'),
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom.

    Handles three URL shapes: a specific video (course+video), a course
    page (list of video references), and the root page (list of course
    references). List results are expanded recursively via self.extract.

    NOTE(review): this source view is elided; guard/try/return lines
    between the visible statements are not shown here.
    """

    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            'id': course + '_' + video,
            'upload_date': None,

            self.report_extraction(info['id'])
            # Per-video metadata XML lives next to the course's videos.
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            info['title'] = mdoc.findall('./title')[0].text
            info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            self._downloader.report_error(u'Invalid metadata XML file')
            info['ext'] = info['url'].rpartition('.')[2]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            'upload_date': None,

            coursepage = self._download_webpage(url, info['id'],
                                                note='Downloading course info page',
                                                errnote='Unable to download course info page')

            m = re.search('<h1>([^<]+)</h1>', coursepage)
            info['title'] = unescapeHTML(m.group(1))
            # Fall back to the id when no <h1> title is present.
            info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
            info['description'] = unescapeHTML(m.group(1))

            # Each VideoPage link becomes a reference entry to expand.
            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            'type': 'reference',
            'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),

            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            'id': 'Stanford OpenClassroom',
            'upload_date': None,

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            rootpage = compat_urllib_request.urlopen(rootURL).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download course info page: ' + compat_str(err))

            info['title'] = info['id']

            # Each CoursePage link becomes a reference entry to expand.
            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            'type': 'reference',
            'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),

            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com.

    Scrapes song/performer/uri metadata from <meta> tags, then queries
    the mediaGen service for an XML list of renditions.

    NOTE(review): this source view is elided; guard/try/return lines
    between the visible statements are not shown here.
    """

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        self._downloader.report_error(u'invalid URL: %s' % url)
        # The scheme is optional in _VALID_URL; default to http.
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        self._downloader.report_error(u'unable to extract song name')
        # Page metadata is declared as ISO-8859-1.
        song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        self._downloader.report_error(u'unable to extract performer')
        performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        self._downloader.report_error(u'unable to mtvn_uri')
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        self._downloader.report_error(u'unable to extract content id')
        content_id = mobj.group(1)

        # mediaGen returns the rendition list for this uri/content pair.
        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video metadata: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        # Format string: "<ext>-<width>x<height>_<bitrate>".
        _,_,ext = rendition.attrib['type'].partition('/')
        format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
        video_url = rendition.find('./src').text
        self._downloader.report_error('Invalid rendition field.')

        'uploader': performer,
        'upload_date': None,
        'title': video_title,
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com.

    Youku obfuscates its file ids: a seed from the playlist JSON drives
    a deterministic shuffle of a character table, which decodes the
    '*'-separated id into the real fileid; segments are then fetched
    one by one with per-segment keys.

    NOTE(review): this source view is elided; some method headers and
    guard/return lines between the visible statements are not shown.
    """
    _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

        # Session id: millisecond timestamp + two random components.
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)
        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        # Deterministically shuffle the character table using `seed`
        # as a linear-congruential PRNG state.
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        for i in range(len(source)):
            seed = (seed * 211 + 30031 ) % 65536
            index = math.floor(seed / 65536 * len(source) )
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        #return ''.join(mixed)

    def _get_file_id(self, fileId, seed):
        # Decode the '*'-separated fileId through the shuffled table.
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        self._downloader.report_error(u'invalid URL: %s' % url)
        video_id = mobj.group('ID')

        # Playlist JSON carries title, seed, formats and segment keys.
        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        jsondata = self._download_webpage(info_url, video_id)

        self.report_extraction(video_id)

        config = json.loads(jsondata)

        video_title = config['data'][0]['title']
        seed = config['data'][0]['seed']

        format = self._downloader.params.get('format', None)
        supported_format = list(config['data'][0]['streamfileids'].keys())

        if format is None or format == 'best':
            if 'hd2' in supported_format:
        elif format == 'worst':

        fileid = config['data'][0]['streamfileids'][format]
        keys = [s['k'] for s in config['data'][0]['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            self._downloader.report_error(u'unable to extract info section')

        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        #column 8,9 of fileid represent the segment number
        #fileid[7:9] should be changed
        for index, key in enumerate(keys):
            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            'id': '%s_part%02d' % (video_id, index),
            'url': download_url,
            'upload_date': None,
            'title': video_title,

            files_info.append(info)
# Scrapes the watch page directly: the flv URL, title and thumbnail are each
# pulled out with a dedicated regex (the class-level *_RE constants).
3102 class XNXXIE(InfoExtractor):
3103 """Information extractor for xnxx.com"""
3105 _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
3107 VIDEO_URL_RE = r'flv_url=(.*?)&'
3108 VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
3109 VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'
3111 def _real_extract(self, url):
3112 mobj = re.match(self._VALID_URL, url)
3114 self._downloader.report_error(u'invalid URL: %s' % url)
3116 video_id = mobj.group(1)
3118 # Get webpage content
3119 webpage = self._download_webpage(url, video_id)
# The flv URL is percent-encoded inside the page, hence the unquote below.
3121 result = re.search(self.VIDEO_URL_RE, webpage)
3123 self._downloader.report_error(u'unable to extract video url')
3125 video_url = compat_urllib_parse.unquote(result.group(1))
3127 result = re.search(self.VIDEO_TITLE_RE, webpage)
3129 self._downloader.report_error(u'unable to extract video title')
3131 video_title = result.group(1)
3133 result = re.search(self.VIDEO_THUMB_RE, webpage)
3135 self._downloader.report_error(u'unable to extract video thumbnail')
3137 video_thumbnail = result.group(1)
# NOTE(review): the info-dict opener and several keys are elided in this copy.
3143 'upload_date': None,
3144 'title': video_title,
3146 'thumbnail': video_thumbnail,
3147 'description': None,
# Two-step extraction: (1) scrape the post page for date/uploader/title and
# the photo-viewer URL, (2) load that viewer page and pick the
# highest-resolution googlevideo link out of it.
3151 class GooglePlusIE(InfoExtractor):
3152 """Information extractor for plus.google.com."""
3154 _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
3155 IE_NAME = u'plus.google'
3157 def report_extract_entry(self, url):
3158 """Report downloading extry"""
3159 self.to_screen(u'Downloading entry: %s' % url)
3161 def report_date(self, upload_date):
3162 """Report downloading extry"""
3163 self.to_screen(u'Entry date: %s' % upload_date)
3165 def report_uploader(self, uploader):
3166 """Report downloading extry"""
3167 self.to_screen(u'Uploader: %s' % uploader)
3169 def report_title(self, video_title):
3170 """Report downloading extry"""
3171 self.to_screen(u'Title: %s' % video_title)
3173 def report_extract_vid_page(self, video_page):
3174 """Report information extraction."""
3175 self.to_screen(u'Extracting video page: %s' % video_page)
3177 def _real_extract(self, url):
3178 # Extract id from URL
3179 mobj = re.match(self._VALID_URL, url)
3181 self._downloader.report_error(u'Invalid URL: %s' % url)
3184 post_url = mobj.group(0)
3185 video_id = mobj.group(1)
3187 video_extension = 'flv'
3189 # Step 1, Retrieve post webpage to extract further information
3190 self.report_extract_entry(post_url)
3191 webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage')
3193 # Extract update date
3195 pattern = 'title="Timestamp">(.*?)</a>'
3196 mobj = re.search(pattern, webpage)
3198 upload_date = mobj.group(1)
3199 # Convert timestring to a format suitable for filename
# Site shows ISO-style dates; normalise to the YYYYMMDD form the downloader uses.
3200 upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3201 upload_date = upload_date.strftime('%Y%m%d')
3202 self.report_date(upload_date)
3206 pattern = r'rel\="author".*?>(.*?)</a>'
3207 mobj = re.search(pattern, webpage)
3209 uploader = mobj.group(1)
3210 self.report_uploader(uploader)
3213 # Get the first line for title
3215 pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3216 mobj = re.search(pattern, webpage)
3218 video_title = mobj.group(1)
3219 self.report_title(video_title)
3221 # Step 2, Stimulate clicking the image box to launch video
3222 pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3223 mobj = re.search(pattern, webpage)
3225 self._downloader.report_error(u'unable to extract video page URL')
3227 video_page = mobj.group(1)
3228 webpage = self._download_webpage(video_page, video_id, u'Downloading video page')
3229 self.report_extract_vid_page(video_page)
3232 # Extract video links on video page
3233 """Extract video links of all sizes"""
3234 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3235 mobj = re.findall(pattern, webpage)
3237 self._downloader.report_error(u'unable to extract video links')
3239 # Sort in resolution
# Tuples are (resolution, url); plain sort puts the largest resolution last.
3240 links = sorted(mobj)
3242 # Choose the lowest of the sort, i.e. highest resolution
3243 video_url = links[-1]
3244 # Only get the url. The resolution part in the tuple has no use anymore
3245 video_url = video_url[-1]
3246 # Treat escaped \u0026 style hex
# Py2 strings have .decode; Py3 str does not, hence the AttributeError fallback.
3248 video_url = video_url.decode("unicode_escape")
3249 except AttributeError: # Python 3
3250 video_url = bytes(video_url, 'ascii').decode('unicode-escape')
3256 'uploader': uploader,
3257 'upload_date': upload_date,
3258 'title': video_title,
3259 'ext': video_extension,
# nba.com extractor: the mp4 URL is constructed directly from the path-derived
# video id against Turner's CDN; page metadata is scraped via _findProp.
3262 class NBAIE(InfoExtractor):
3263 _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
3266 def _real_extract(self, url):
3267 mobj = re.match(self._VALID_URL, url)
3269 self._downloader.report_error(u'invalid URL: %s' % url)
3272 video_id = mobj.group(1)
3273 if video_id.endswith('/index.html'):
3274 video_id = video_id[:-len('/index.html')]
3276 webpage = self._download_webpage(url, video_id)
3278 video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
# Closure helper: first regex group from the page, HTML-unescaped, or `default`.
3279 def _findProp(rexp, default=None):
3280 m = re.search(rexp, webpage)
3282 return unescapeHTML(m.group(1))
3286 shortened_video_id = video_id.rpartition('/')[2]
3287 title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
# NOTE(review): 'uploader_date' looks like a typo for 'upload_date' — confirm
# against the field names the downloader expects before relying on it.
3289 'id': shortened_video_id,
3293 'uploader_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
3294 'description': _findProp(r'<div class="description">(.*?)</h1>'),
3298 class JustinTVIE(InfoExtractor):
3299 """Information extractor for justin.tv and twitch.tv"""
3300 # TODO: One broadcast may be split into multiple videos. The key
3301 # 'broadcast_id' is the same for all parts, and 'broadcast_part'
3302 # starts at 1 and increases. Can we treat all parts as one video?
3304 _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
3305 ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
3306 _JUSTIN_PAGE_LIMIT = 100
3307 IE_NAME = u'justin.tv'
3309 def report_download_page(self, channel, offset):
3310 """Report attempt to download a single page of videos."""
3311 self.to_screen(u'%s: Downloading video information from %d to %d' %
3312 (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
3314 # Return count of items, list of *valid* items
3315 def _parse_page(self, url, video_id):
3316 webpage = self._download_webpage(url, video_id,
3317 u'Downloading video info JSON',
3318 u'unable to download video info JSON')
# The API returns a JSON list of clips on success, or an error object.
3320 response = json.loads(webpage)
3321 if type(response) != list:
3322 error_text = response.get('error', 'unknown error')
3323 self._downloader.report_error(u'Justin.tv API: %s' % error_text)
3326 for clip in response:
3327 video_url = clip['video_file_url']
3329 video_extension = os.path.splitext(video_url)[1][1:]
# start_time is ISO-ish; strip the dashes from the date part → YYYYMMDD.
3330 video_date = re.sub('-', '', clip['start_time'][:10])
3331 video_uploader_id = clip.get('user_id', clip.get('channel_id'))
3332 video_id = clip['id']
3333 video_title = clip.get('title', video_id)
3337 'title': video_title,
3338 'uploader': clip.get('channel_name', video_uploader_id),
3339 'uploader_id': video_uploader_id,
3340 'upload_date': video_date,
3341 'ext': video_extension,
3343 return (len(response), info)
3345 def _real_extract(self, url):
3346 mobj = re.match(self._VALID_URL, url)
3348 self._downloader.report_error(u'invalid URL: %s' % url)
# Group 1 = channel name, group 2 = single broadcast id; lastindex tells us
# which form of URL we matched and therefore which API endpoint to hit.
3351 api = 'http://api.justin.tv'
3352 video_id = mobj.group(mobj.lastindex)
3354 if mobj.lastindex == 1:
3356 api += '/channel/archives/%s.json'
3358 api += '/broadcast/by_archive/%s.json'
3359 api = api % (video_id,)
3361 self.report_extraction(video_id)
# Page through the API in _JUSTIN_PAGE_LIMIT-sized chunks; a short page
# (count != limit) means we reached the end.
3365 limit = self._JUSTIN_PAGE_LIMIT
3368 self.report_download_page(video_id, offset)
3369 page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
3370 page_count, page_info = self._parse_page(page_url, video_id)
3371 info.extend(page_info)
3372 if not paged or page_count != limit:
# funnyordie.com extractor: video URL from the <video>/<source> markup,
# title from the player h1 (falling back to <title>), description from og: meta.
3377 class FunnyOrDieIE(InfoExtractor):
3378 _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
3380 def _real_extract(self, url):
3381 mobj = re.match(self._VALID_URL, url)
3383 self._downloader.report_error(u'invalid URL: %s' % url)
3386 video_id = mobj.group('id')
3387 webpage = self._download_webpage(url, video_id)
3389 m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
3391 self._downloader.report_error(u'unable to find video information')
3392 video_url = unescapeHTML(m.group('url'))
3394 m = re.search(r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", webpage, flags=re.DOTALL)
3396 m = re.search(r'<title>(?P<title>[^<]+?)</title>', webpage)
3398 self._downloader.report_error(u'Cannot find video title')
3399 title = clean_html(m.group('title'))
3401 m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
3403 desc = unescapeHTML(m.group('desc'))
3412 'description': desc,
# Steam store extractor: visits the age-gate URL with a fixed fake birthday,
# then zips three regex scans (movie entries, titles, thumbnails) into a
# playlist of videos for the game page.
3416 class SteamIE(InfoExtractor):
3417 _VALID_URL = r"""http://store\.steampowered\.com/
3419 (?P<urltype>video|app)/ #If the page is only for videos or for a game
3421 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
3425 def suitable(cls, url):
3426 """Receives a URL and returns True if suitable for this IE."""
# Overridden because _VALID_URL is written with (?x)-style whitespace and
# must be matched with re.VERBOSE.
3427 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
3429 def _real_extract(self, url):
3430 m = re.match(self._VALID_URL, url, re.VERBOSE)
3431 gameID = m.group('gameID')
3432 videourl = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970' % gameID
3433 self.report_age_confirmation()
3434 webpage = self._download_webpage(videourl, gameID)
3435 game_title = re.search(r'<h2 class="pageheader">(?P<game_title>.*?)</h2>', webpage).group('game_title')
3437 urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
3438 mweb = re.finditer(urlRE, webpage)
3439 namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
3440 titles = re.finditer(namesRE, webpage)
3441 thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
3442 thumbs = re.finditer(thumbsRE, webpage)
# Relies on the three scans yielding entries in the same page order.
3444 for vid,vtitle,thumb in zip(mweb,titles,thumbs):
3445 video_id = vid.group('videoID')
3446 title = vtitle.group('videoName')
3447 video_url = vid.group('videoURL')
3448 video_thumb = thumb.group('thumbnail')
3450 self._downloader.report_error(u'Cannot find video url for %s' % video_id)
3455 'title': unescapeHTML(title),
3456 'thumbnail': video_thumb
3459 return [self.playlist_result(videos, gameID, game_title)]
# ustream.tv recorded-video extractor: the flv URL is derived directly from
# the video id; title and uploader are scraped from data-* attributes.
3461 class UstreamIE(InfoExtractor):
3462 _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
3463 IE_NAME = u'ustream'
3465 def _real_extract(self, url):
3466 m = re.match(self._VALID_URL, url)
3467 video_id = m.group('videoID')
3468 video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
3469 webpage = self._download_webpage(url, video_id)
# NOTE(review): no None-checks on these searches — a markup change would raise
# AttributeError on .group(); verify intended.
3470 m = re.search(r'data-title="(?P<title>.+)"',webpage)
3471 title = m.group('title')
3472 m = re.search(r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',webpage)
3473 uploader = m.group('uploader')
3479 'uploader': uploader
# worldstarhiphop.com extractor: finds an mp4/flv CDN URL in the page source,
# takes the page <title>, and falls back to a timestamped title when absent.
3483 class WorldStarHipHopIE(InfoExtractor):
3484 _VALID_URL = r'http://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
3485 IE_NAME = u'WorldStarHipHop'
3487 def _real_extract(self, url):
# Matches a direct media URL served from the hw-videos / hw-post1 hosts.
3488 _src_url = r"""(http://(hw-videos|hw-post1).*(?:mp4|flv))"""
3490 m = re.match(self._VALID_URL, url)
3491 video_id = m.group('id')
3493 webpage_src = self._download_webpage(url, video_id)
3495 mobj = re.search(_src_url, webpage_src)
3497 if mobj is not None:
3498 video_url = mobj.group()
3499 if 'mp4' in video_url:
3504 self._downloader.report_error(u'Cannot find video url for %s' % video_id)
3507 _title = r"""<title>(.*)</title>"""
3509 mobj = re.search(_title, webpage_src)
3511 if mobj is not None:
3512 title = mobj.group(1)
3514 title = 'World Start Hip Hop - %s' % time.ctime()
3516 _thumbnail = r"""rel="image_src" href="(.*)" />"""
3517 mobj = re.search(_thumbnail, webpage_src)
3519 # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
3520 if mobj is not None:
3521 thumbnail = mobj.group(1)
3523 _title = r"""candytitles.*>(.*)</span>"""
3524 mobj = re.search(_title, webpage_src)
3525 if mobj is not None:
3526 title = mobj.group(1)
3533 'thumbnail' : thumbnail,
# rbmaradio.com extractor: show metadata is embedded as a JSON blob assigned
# to gon.show in an inline <script>; everything is read from that dict.
3538 class RBMARadioIE(InfoExtractor):
3539 _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'
3541 def _real_extract(self, url):
3542 m = re.match(self._VALID_URL, url)
3543 video_id = m.group('videoID')
3545 webpage = self._download_webpage(url, video_id)
3546 m = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
3548 raise ExtractorError(u'Cannot find metadata')
3549 json_data = m.group(1)
3552 data = json.loads(json_data)
3553 except ValueError as e:
3554 raise ExtractorError(u'Invalid JSON: ' + str(e))
# Force the 256 kbps variant via the cbr query parameter; extension comes
# from the URL path.
3556 video_url = data['akamai_url'] + '&cbr=256'
3557 url_parts = compat_urllib_parse_urlparse(video_url)
3558 video_ext = url_parts.path.rpartition('.')[2]
3563 'title': data['title'],
3564 'description': data.get('teaser_text'),
3565 'location': data.get('country_of_origin'),
3566 'uploader': data.get('host', {}).get('name'),
3567 'uploader_id': data.get('host', {}).get('slug'),
3568 'thumbnail': data.get('image', {}).get('large_url_2x'),
3569 'duration': data.get('duration'),
3574 class YouPornIE(InfoExtractor):
3575 """Information extractor for youporn.com."""
3576 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
3578 def _print_formats(self, formats):
3579 """Print all available formats"""
3580 print(u'Available formats:')
3581 print(u'ext\t\tformat')
3582 print(u'---------------------------------')
3583 for format in formats:
3584 print(u'%s\t\t%s' % (format['ext'], format['format']))
# Returns the format dict matching req_format (loop header elided in this copy).
3586 def _specific(self, req_format, formats):
3588 if(x["format"]==req_format):
3592 def _real_extract(self, url):
3593 mobj = re.match(self._VALID_URL, url)
3595 self._downloader.report_error(u'invalid URL: %s' % url)
3598 video_id = mobj.group('videoid')
# The age_verified cookie bypasses the site's age gate.
3600 req = compat_urllib_request.Request(url)
3601 req.add_header('Cookie', 'age_verified=1')
3602 webpage = self._download_webpage(req, video_id)
3604 # Get the video title
3605 result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
3607 raise ExtractorError(u'Unable to extract video title')
3608 video_title = result.group('title').strip()
3610 # Get the video date
3611 result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
3613 self._downloader.report_warning(u'unable to extract video date')
3616 upload_date = unified_strdate(result.group('date').strip())
3618 # Get the video uploader
3619 result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
3621 self._downloader.report_warning(u'unable to extract uploader')
3622 video_uploader = None
3624 video_uploader = result.group('uploader').strip()
3625 video_uploader = clean_html( video_uploader )
3627 # Get all of the formats available
3628 DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
3629 result = re.search(DOWNLOAD_LIST_RE, webpage)
3631 raise ExtractorError(u'Unable to extract download list')
3632 download_list_html = result.group('download_list').strip()
3634 # Get all of the links from the page
3635 LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
3636 links = re.findall(LINK_RE, download_list_html)
3637 if(len(links) == 0):
3638 raise ExtractorError(u'ERROR: no known formats available for video')
3640 self.to_screen(u'Links found: %d' % len(links))
3645 # A link looks like this:
3646 # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
3647 # A path looks like this:
3648 # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
# Resolution ("480p") and bitrate ("370k") are encoded in the 5th path segment.
3649 video_url = unescapeHTML( link )
3650 path = compat_urllib_parse_urlparse( video_url ).path
3651 extension = os.path.splitext( path )[1][1:]
3652 format = path.split('/')[4].split('_')[:2]
3655 format = "-".join( format )
3656 title = u'%s-%s-%s' % (video_title, size, bitrate)
3661 'uploader': video_uploader,
3662 'upload_date': upload_date,
3667 'description': None,
# Format selection: list-only mode, then best/worst/all/specific handling.
3671 if self._downloader.params.get('listformats', None):
3672 self._print_formats(formats)
3675 req_format = self._downloader.params.get('format', None)
3676 self.to_screen(u'Format: %s' % req_format)
3678 if req_format is None or req_format == 'best':
3680 elif req_format == 'worst':
# Assumes `formats` is ordered best-first, so the worst is the last entry.
3681 return [formats[-1]]
3682 elif req_format in ('-1', 'all'):
3685 format = self._specific( req_format, formats )
3687 self._downloader.report_error(u'requested format not available')
3693 class PornotubeIE(InfoExtractor):
3694 """Information extractor for pornotube.com."""
3695 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
3697 def _real_extract(self, url):
3698 mobj = re.match(self._VALID_URL, url)
3700 self._downloader.report_error(u'invalid URL: %s' % url)
# Title comes from the URL itself (the <title> named group), not the page.
3703 video_id = mobj.group('videoid')
3704 video_title = mobj.group('title')
3706 # Get webpage content
3707 webpage = self._download_webpage(url, video_id)
3710 VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
3711 result = re.search(VIDEO_URL_RE, webpage)
3713 self._downloader.report_error(u'unable to extract video url')
3715 video_url = compat_urllib_parse.unquote(result.group('url'))
3717 #Get the uploaded date
3718 VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
3719 result = re.search(VIDEO_UPLOADED_RE, webpage)
# NOTE(review): this error message says "title" but the failed extraction is
# the upload date — likely a copy/paste slip; confirm before changing.
3721 self._downloader.report_error(u'unable to extract video title')
3723 upload_date = unified_strdate(result.group('date'))
3725 info = {'id': video_id,
3728 'upload_date': upload_date,
3729 'title': video_title,
3735 class YouJizzIE(InfoExtractor):
3736 """Information extractor for youjizz.com."""
3737 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'
3739 def _real_extract(self, url):
3740 mobj = re.match(self._VALID_URL, url)
3742 self._downloader.report_error(u'invalid URL: %s' % url)
3745 video_id = mobj.group('videoid')
3747 # Get webpage content
3748 webpage = self._download_webpage(url, video_id)
3750 # Get the video title
3751 result = re.search(r'<title>(?P<title>.*)</title>', webpage)
3753 raise ExtractorError(u'ERROR: unable to extract video title')
3754 video_title = result.group('title').strip()
3756 # Get the embed page
# The watch page only references an embed page; the real media URL lives there.
3757 result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
3759 raise ExtractorError(u'ERROR: unable to extract embed page')
3761 embed_page_url = result.group(0).strip()
3762 video_id = result.group('videoid')
3764 webpage = self._download_webpage(embed_page_url, video_id)
# The flv URL is passed to the flash player via so.addVariable("file", ...).
3767 result = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
3769 raise ExtractorError(u'ERROR: unable to extract video url')
3770 video_url = result.group('source')
3772 info = {'id': video_id,
3774 'title': video_title,
3777 'player_url': embed_page_url}
# 8tracks.com extractor: reads the PAGE.mix JSON embedded in the playlist
# page, then walks the play/next API with a random session id until the API
# reports the last track.
3781 class EightTracksIE(InfoExtractor):
3783 _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'
3785 def _real_extract(self, url):
3786 mobj = re.match(self._VALID_URL, url)
3788 raise ExtractorError(u'Invalid URL: %s' % url)
3789 playlist_id = mobj.group('id')
3791 webpage = self._download_webpage(url, playlist_id)
3793 m = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
3795 raise ExtractorError(u'Cannot find trax information')
3796 json_like = m.group(1)
3797 data = json.loads(json_like)
# Session id is just a random number; the API tracks play position by it.
3799 session = str(random.randint(0, 1000000000))
3801 track_count = data['tracks_count']
3802 first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
3803 next_url = first_url
3805 for i in itertools.count():
3806 api_json = self._download_webpage(next_url, playlist_id,
3807 note=u'Downloading song information %s/%s' % (str(i+1), track_count),
3808 errnote=u'Failed to download song information')
3809 api_data = json.loads(api_json)
3810 track_data = api_data[u'set']['track']
3812 'id': track_data['id'],
3813 'url': track_data['track_file_stream_url'],
3814 'title': track_data['performer'] + u' - ' + track_data['name'],
3815 'raw_title': track_data['name'],
3816 'uploader_id': data['user']['login'],
# Stop once the API flags the final track; otherwise request the next one.
3820 if api_data['set']['at_last_track']:
3822 next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
# keek.com extractor: media and thumbnail URLs are derived from the video id
# against keek's CDN; title/uploader are scraped from the page.
3825 class KeekIE(InfoExtractor):
3826 _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
3829 def _real_extract(self, url):
3830 m = re.match(self._VALID_URL, url)
3831 video_id = m.group('videoID')
3832 video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
3833 thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
3834 webpage = self._download_webpage(url, video_id)
3835 m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
3836 title = unescapeHTML(m.group('title'))
3837 m = re.search(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>', webpage)
3838 uploader = clean_html(m.group('uploader'))
3844 'thumbnail': thumbnail,
3845 'uploader': uploader
# ted.com extractor: handles both single talks and playlists; talk download
# URLs are built from the "mediaSlug" found in the talkDetails script blob.
3849 class TEDIE(InfoExtractor):
3850 _VALID_URL=r'''http://www\.ted\.com/
3852 ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
3854 ((?P<type_talk>talks)) # We have a simple talk
3856 (/lang/(.*?))? # The url may contain the language
3857 /(?P<name>\w+) # Here goes the name and then ".html"
3861 def suitable(cls, url):
3862 """Receives a URL and returns True if suitable for this IE."""
# Overridden because _VALID_URL uses verbose-mode whitespace/comments.
3863 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
3865 def _real_extract(self, url):
3866 m=re.match(self._VALID_URL, url, re.VERBOSE)
3867 if m.group('type_talk'):
3868 return [self._talk_info(url)]
3870 playlist_id=m.group('playlist_id')
3871 name=m.group('name')
3872 self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
3873 return [self._playlist_videos_info(url,name,playlist_id)]
3875 def _talk_video_link(self,mediaSlug):
3876 '''Returns the video link for that mediaSlug'''
3877 return 'http://download.ted.com/talks/%s.mp4' % mediaSlug
3879 def _playlist_videos_info(self,url,name,playlist_id=0):
3880 '''Returns the videos of the playlist'''
3882 <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
3883 ([.\s]*?)data-playlist_item_id="(\d+)"
3884 ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
3886 video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
3887 webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
3888 m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
3889 m_names=re.finditer(video_name_RE,webpage)
3891 playlist_RE = r'div class="headline">(\s*?)<h1>(\s*?)<span>(?P<playlist_title>.*?)</span>'
3892 m_playlist = re.search(playlist_RE, webpage)
3893 playlist_title = m_playlist.group('playlist_title')
# Each playlist entry is delegated back through url_result to this same IE.
3895 playlist_entries = []
3896 for m_video, m_name in zip(m_videos,m_names):
3897 video_id=m_video.group('video_id')
3898 talk_url='http://www.ted.com%s' % m_name.group('talk_url')
3899 playlist_entries.append(self.url_result(talk_url, 'TED'))
3900 return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)
3902 def _talk_info(self, url, video_id=0):
3903 """Return the video for the talk in the url"""
3904 m=re.match(self._VALID_URL, url,re.VERBOSE)
3905 videoName=m.group('name')
3906 webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
3907 # If the url includes the language we get the title translated
3908 title_RE=r'<span id="altHeadline" >(?P<title>.*)</span>'
3909 title=re.search(title_RE, webpage).group('title')
3910 info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
3911 "id":(?P<videoID>[\d]+).*?
3912 "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
3913 thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
3914 thumb_match=re.search(thumb_RE,webpage)
3915 info_match=re.search(info_RE,webpage,re.VERBOSE)
3916 video_id=info_match.group('videoID')
3917 mediaSlug=info_match.group('mediaSlug')
3918 video_url=self._talk_video_link(mediaSlug)
3924 'thumbnail': thumb_match.group('thumbnail')
# myspass.de extractor: fetches an XML metadata document keyed by the video
# id (last URL path element) and reads url/title/format/description from it.
3928 class MySpassIE(InfoExtractor):
3929 _VALID_URL = r'http://www.myspass.de/.*'
3931 def _real_extract(self, url):
3932 META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'
3934 # video id is the last path element of the URL
3935 # usually there is a trailing slash, so also try the second but last
3936 url_path = compat_urllib_parse_urlparse(url).path
3937 url_parent_path, video_id = os.path.split(url_path)
3939 _, video_id = os.path.split(url_parent_path)
3942 metadata_url = META_DATA_URL_TEMPLATE % video_id
3943 metadata_text = self._download_webpage(metadata_url, video_id)
3944 metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))
3946 # extract values from metadata
3947 url_flv_el = metadata.find('url_flv')
3948 if url_flv_el is None:
3949 self._downloader.report_error(u'unable to extract download url')
3951 video_url = url_flv_el.text
3952 extension = os.path.splitext(video_url)[1][1:]
3953 title_el = metadata.find('title')
3954 if title_el is None:
3955 self._downloader.report_error(u'unable to extract title')
3957 title = title_el.text
# format/description/thumbnail are optional; missing elements fall through
# (defaults assigned on lines elided from this copy).
3958 format_id_el = metadata.find('format_id')
3959 if format_id_el is None:
3962 format = format_id_el.text
3963 description_el = metadata.find('description')
3964 if description_el is not None:
3965 description = description_el.text
3968 imagePreview_el = metadata.find('imagePreview')
3969 if imagePreview_el is not None:
3970 thumbnail = imagePreview_el.text
3979 'thumbnail': thumbnail,
3980 'description': description
# spiegel.de extractor: title from the page, then a per-video XML manifest
# whose last entry supplies filename and duration.
3984 class SpiegelIE(InfoExtractor):
3985 _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'
3987 def _real_extract(self, url):
3988 m = re.match(self._VALID_URL, url)
3989 video_id = m.group('videoID')
3991 webpage = self._download_webpage(url, video_id)
3992 m = re.search(r'<div class="spVideoTitle">(.*?)</div>', webpage)
3994 raise ExtractorError(u'Cannot find title')
3995 video_title = unescapeHTML(m.group(1))
3997 xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
3998 xml_code = self._download_webpage(xml_url, video_id,
3999 note=u'Downloading XML', errnote=u'Failed to download XML')
# The manifest lists variants; the last child element is the one used here.
4001 idoc = xml.etree.ElementTree.fromstring(xml_code)
4002 last_type = idoc[-1]
4003 filename = last_type.findall('./filename')[0].text
4004 duration = float(last_type.findall('./duration')[0].text)
4006 video_url = 'http://video2.spiegel.de/flash/' + filename
4007 video_ext = filename.rpartition('.')[2]
4012 'title': video_title,
4013 'duration': duration,
# liveleak.com extractor: media URL from the player config ('file: "..."'),
# title/description from og: meta tags, uploader from the "By:" byline.
4017 class LiveLeakIE(InfoExtractor):
4019 _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
4020 IE_NAME = u'liveleak'
4022 def _real_extract(self, url):
4023 mobj = re.match(self._VALID_URL, url)
4025 self._downloader.report_error(u'invalid URL: %s' % url)
4028 video_id = mobj.group('video_id')
4030 webpage = self._download_webpage(url, video_id)
4032 m = re.search(r'file: "(.*?)",', webpage)
4034 self._downloader.report_error(u'unable to find video url')
4036 video_url = m.group(1)
4038 m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
4040 self._downloader.report_error(u'Cannot find video title')
# The site prefixes titles with "LiveLeak.com -"; strip it for a clean title.
4041 title = unescapeHTML(m.group('title')).replace('LiveLeak.com -', '').strip()
4043 m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
4045 desc = unescapeHTML(m.group('desc'))
4049 m = re.search(r'By:.*?(\w+)</a>', webpage)
4051 uploader = clean_html(m.group(1))
4060 'description': desc,
4061 'uploader': uploader
# ARD Mediathek extractor: collects every mediaCollection.addMediaStream(...)
# call from the page, picks the default media type at the highest quality,
# and distinguishes RTMP streams from plain HTTP mp4 downloads.
4066 class ARDIE(InfoExtractor):
4067 _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
4068 _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
4069 _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'
4071 def _real_extract(self, url):
4072 # determine video id from url
# Prefer the numeric documentId query parameter when present; otherwise use
# the last path component matched by _VALID_URL.
4073 m = re.match(self._VALID_URL, url)
4075 numid = re.search(r'documentId=([0-9]+)', url)
4077 video_id = numid.group(1)
4079 video_id = m.group('video_id')
4081 # determine title and media streams from webpage
4082 html = self._download_webpage(url, video_id)
4083 title = re.search(self._TITLE, html).group('title')
4084 streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
# No streams + an "fsk" marker means the video is age-restricted until 8 pm.
4086 assert '"fsk"' in html
4087 self._downloader.report_error(u'this video is only available after 8:00 pm')
4090 # choose default media type and highest quality for now
4091 stream = max([s for s in streams if int(s["media_type"]) == 0],
4092 key=lambda s: int(s["quality"]))
4094 # there's two possibilities: RTMP stream or HTTP download
4095 info = {'id': video_id, 'title': title, 'ext': 'mp4'}
4096 if stream['rtmp_url']:
4097 self.to_screen(u'RTMP download detected')
4098 assert stream['video_url'].startswith('mp4:')
4099 info["url"] = stream["rtmp_url"]
4100 info["play_path"] = stream['video_url']
4102 assert stream["video_url"].endswith('.mp4')
4103 info["url"] = stream["video_url"]
# Tumblr extractor: normalises the URL to the canonical post URL, then finds
# the \x22-escaped video_file src (and its MIME subtype as the extension)
# embedded in the page's inline javascript.
4106 class TumblrIE(InfoExtractor):
4107 _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'
4109 def _real_extract(self, url):
4110 m_url = re.match(self._VALID_URL, url)
4111 video_id = m_url.group('id')
4112 blog = m_url.group('blog_name')
4114 url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
4115 webpage = self._download_webpage(url, video_id)
# The src/type attributes are hex-escaped (\x22 = double quote) in the page.
4117 re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
4118 video = re.search(re_video, webpage)
4120 self.to_screen("No video founded")
4122 video_url = video.group('video_url')
4123 ext = video.group('ext')
4125 re_thumb = r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22' # We pick the first poster
4126 thumb = re.search(re_thumb, webpage).group('thumb').replace('\\', '')
4128 # The only place where you can get a title, it's not complete,
4129 # but searching in other places doesn't work for all videos
4130 re_title = r'<title>(?P<title>.*?)</title>'
4131 title = unescapeHTML(re.search(re_title, webpage, re.DOTALL).group('title'))
4133 return [{'id': video_id,
# Bandcamp extractor: only works for tracks with a free-download page. It
# follows freeDownloadPage, reads the track JSON, then rewrites the expired
# download URL into a /statdownload request to obtain a fresh one.
4140 class BandcampIE(InfoExtractor):
4141 _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'
4143 def _real_extract(self, url):
4144 mobj = re.match(self._VALID_URL, url)
4145 title = mobj.group('title')
4146 webpage = self._download_webpage(url, title)
4147 # We get the link to the free download page
4148 m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
4149 if m_download is None:
4150 self._downloader.report_error('No free songs founded')
4152 download_link = m_download.group(1)
# Track id is scraped out of the inline TralbumData javascript object.
4153 id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
4154 webpage, re.MULTILINE|re.DOTALL).group('id')
4156 download_webpage = self._download_webpage(download_link, id,
4157 'Downloading free downloads page')
4158 # We get the dictionary of the track from some javascrip code
4159 info = re.search(r'items: (.*?),$',
4160 download_webpage, re.MULTILINE).group(1)
4161 info = json.loads(info)[0]
4162 # We pick mp3-320 for now, until format selection can be easily implemented.
4163 mp3_info = info[u'downloads'][u'mp3-320']
4164 # If we try to use this url it says the link has expired
4165 initial_url = mp3_info[u'url']
4166 re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
4167 m_url = re.match(re_url, initial_url)
4168 #We build the url we will use to get the final track url
4169 # This url is build in Bandcamp in the script download_bunde_*.js
# The .rand value is hard-coded; the server answers with a retry_url instead
# of download_url, which is what gets used below.
4170 request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), id, m_url.group('ts'))
4171 final_url_webpage = self._download_webpage(request_url, id, 'Requesting download url')
4172 # If we could correctly generate the .rand field the url would be
4173 #in the "download_url" key
4174 final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)
4176 track_info = {'id':id,
4177 'title' : info[u'title'],
4180 'thumbnail' : info[u'thumb_url'],
4181 'uploader' : info[u'artist']
class RedtubeIE(InfoExtractor):
    """Information Extractor for redtube"""
    _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'
    IE_NAME = u'redtube'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # Guard reconstructed (elided in the reviewed view): without it
            # mobj.group() below would raise AttributeError.
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group('id')
        video_extension = 'mp4'
        webpage = self._download_webpage(url, video_id)
        self.report_extraction(video_id)
        # Single raw literal replaces the pointless 'a'+'b'+'c' concatenation;
        # the resulting pattern string is byte-identical.
        mobj = re.search(r'<source src="(.+)" type="video/mp4">', webpage)
        if mobj is not None:
            video_url = mobj.group(1)
        else:
            self._downloader.report_error(u'unable to extract media URL')
            return
        mobj = re.search(r'<h1 class="videoTitle slidePanelMovable">(.+)</h1>', webpage)
        if mobj is not None:
            video_title = mobj.group(1)
        else:
            # Fall back to a timestamped placeholder title.
            video_title = 'Redtube - %s' % time.ctime()

        # NOTE(review): part of this return dict was elided in the reviewed
        # view; 'id'/'url' keys reconstructed from the values extracted above.
        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_extension,
            'title': video_title,
        }]
# NOTE(review): the body of gen_extractors() is almost entirely elided in this
# view — only three of the instantiated extractor entries (and not the list
# brackets or return statement) are visible. Kept byte-identical; do not edit
# without the full file.
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    YoutubePlaylistIE(),
    StanfordOpenClassroomIE(),
    WorldStarHipHopIE(),
def get_info_extractor(ie_name):
    """Look up and return the info extractor class named ``ie_name`` + 'IE'.

    Raises KeyError if no such class exists at module level.
    """
    class_name = '%sIE' % ie_name
    return globals()[class_name]