# -*- coding: utf-8 -*-

from __future__ import absolute_import

import base64
import datetime
import netrc
import re
import socket
import sys
import xml.etree.ElementTree
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    id:             Video identifier.
    url:            Final video URL.
    title:          Video title, unescaped.
    ext:            Video filename extension.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The subtitle file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    The fields should all be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    _ready = False       # set once _real_initialize() has run
    _downloader = None   # FileDownloader instance, set via set_downloader()
    _WORKING = True      # override with False to mark a broken IE

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url) is not None

    @classmethod
    def working(cls):
        """Getter method for _WORKING."""
        return cls._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @property
    def IE_NAME(self):
        # Default IE name: class name minus the trailing 'IE' suffix.
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the response handle """
        if note is None:
            note = u'Downloading video webpage'
        # note=False suppresses the status line entirely.
        if note is not False:
            self.to_screen(u'%s: %s' % (video_id, note))
        try:
            return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is None:
                errnote = u'Unable to download webpage'
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the data of the page as a string """
        urlh = self._request_webpage(url_or_request, video_id, note, errnote)
        content_type = urlh.headers.get('Content-Type', '')
        # Honour an explicit charset in the Content-Type header; fall back to UTF-8.
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        if m:
            encoding = m.group(1)
        else:
            encoding = 'utf-8'
        webpage_bytes = urlh.read()
        if self._downloader.params.get('dump_intermediate_pages', False):
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                url = url_or_request
            self.to_screen(u'Dumping request to ' + url)
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        return webpage_bytes.decode(encoding, 'replace')

    def to_screen(self, msg):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))

    #Methods for following #608
    #They set the correct value of the '_type' key
    def video_result(self, video_info):
        """Returns a video"""
        video_info['_type'] = 'video'
        return video_info

    def url_result(self, url, ie=None):
        """Returns a url that points to a page that should be processed"""
        #TODO: ie should be the class used for getting the info
        video_info = {'_type': 'url',
                      'url': url,
                      'ie_key': ie}
        return video_info

    def playlist_result(self, entries, playlist_id=None, playlist_title=None):
        """Returns a playlist"""
        video_info = {'_type': 'playlist',
                      'entries': entries}
        if playlist_id:
            video_info['id'] = playlist_id
        if playlist_title:
            video_info['title'] = playlist_title
        return video_info
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    _VALID_URL = r"""^
                     (
                         (?:https?://)?  # http(s):// (optional)
                         (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                            tube\.majestyc\.net/) # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?     # handle anchor (#/) redirect urls
                         (?:             # the various things that can precede the ID:
                             (?:(?:v|embed|e)/)  # v/ or embed/ or e/
                             |(?:        # or the v= param in all its forms
                                 (?:watch(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)  # the params delimiter ? or # or #!
                                 (?:.*?&)?    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         )?              # optional -> youtube.com/xxxx is OK
                     )?                  # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)    # here is it! the YouTube video ID
                     (?(1).+)?           # if we found the ID, everything can follow
                     $"""
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container extension; anything unlisted defaults to 'flv'
    _video_extensions = {
        '13': '3gp',
        '17': 'mp4',
        '18': 'mp4',
        '22': 'mp4',
        '37': 'mp4',
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
        '43': 'webm',
        '44': 'webm',
        '45': 'webm',
        '46': 'webm',
    }
    # itag -> "HxW" dimension string used for --get-format display
    _video_dimensions = {
        '5': '240x400',
        '6': '???',
        '13': '???',
        '17': '144x176',
        '18': '360x640',
        '22': '720x1280',
        '34': '360x640',
        '35': '480x854',
        '37': '1080x1920',
        '38': '3072x4096',
        '43': '360x640',
        '44': '480x854',
        '45': '720x1280',
        '46': '1080x1920',
    }
    IE_NAME = u'youtube'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Playlist URLs also match _VALID_URL; defer them to YoutubePlaylistIE.
        if YoutubePlaylistIE.suitable(url): return False
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self.to_screen(u'Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen(u'Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self.to_screen(u'%s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self.to_screen(u'%s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to check for available subtitles."""
        self.to_screen(u'%s: Checking available subtitles' % video_id)

    def report_video_subtitles_request(self, video_id, sub_lang, format):
        """Report attempt to download a subtitle track."""
        self.to_screen(u'%s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))

    def report_video_subtitles_available(self, video_id, sub_lang_list):
        """Report available subtitles."""
        sub_lang = ",".join(list(sub_lang_list.keys()))
        self.to_screen(u'%s: Available subtitles for video: %s' % (video_id, sub_lang))

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self.to_screen(u'%s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report that a requested format is not available."""
        self.to_screen(u'%s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self.to_screen(u'RTMP download detected')

    def _get_available_subtitles(self, video_id):
        """Return {lang_code: track_name} or an (error_message, None) tuple on failure."""
        self.report_video_subtitles_download(video_id)
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
        try:
            sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'unable to download video subtitles: %s' % compat_str(err), None)
        sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
        sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
        if not sub_lang_list:
            return (u'video doesn\'t have subtitles', None)
        return sub_lang_list

    def _list_available_subtitles(self, video_id):
        """Print the list of available subtitle languages."""
        sub_lang_list = self._get_available_subtitles(video_id)
        self.report_video_subtitles_available(video_id, sub_lang_list)

    def _request_subtitle(self, sub_lang, sub_name, video_id, format):
        """
        Return a tuple:
        (error_message, sub_lang, sub)
        """
        self.report_video_subtitles_request(video_id, sub_lang, format)
        params = compat_urllib_parse.urlencode({
            'lang': sub_lang,
            'name': sub_name,
            'v': video_id,
            'fmt': format,
        })
        url = 'http://www.youtube.com/api/timedtext?' + params
        try:
            sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
        if not sub:
            return (u'Did not fetch video subtitles', None, None)
        return (None, sub_lang, sub)

    def _extract_subtitle(self, video_id):
        """
        Return a list with a tuple:
        [(error_message, sub_lang, sub)]
        """
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
            return [(sub_lang_list[0], None, None)]
        # Choose the requested language, else English, else the first available one.
        if self._downloader.params.get('subtitleslang', False):
            sub_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in sub_lang_list:
            sub_lang = 'en'
        else:
            sub_lang = list(sub_lang_list.keys())[0]
        if not sub_lang in sub_lang_list:
            return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]

        subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
        return [subtitle]

    def _extract_all_subtitles(self, video_id):
        """Download every available subtitle track; returns a list of result tuples."""
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
            return [(sub_lang_list[0], None, None)]
        subtitles = []
        for sub_lang in sub_lang_list:
            subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
            subtitles.append(subtitle)
        return subtitles

    def _print_formats(self, formats):
        """Print itag, extension and dimensions for each available format."""
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        """Set language, optionally log in (credentials or .netrc), and confirm age."""
        if self._downloader is None:
            return

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        # Set language
        request = compat_urllib_request.Request(self._LANG_URL)
        try:
            self.report_lang()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
            return

        # No authentication to be performed
        if username is None:
            return

        request = compat_urllib_request.Request(self._LOGIN_URL)
        try:
            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
            return

        # Scrape the hidden GALX/dsh tokens the login form requires.
        galx = None
        dsh = None
        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
            galx = match.group(1)
        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
        if match:
            dsh = match.group(1)

        # Log in
        login_form_strs = {
                u'continue': u'http://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                u'Email': username,
                u'GALX': galx,
                u'Passwd': password,
                u'PersistentCookie': u'yes',
                u'bgresponse': u'js_disabled',
                u'checkConnection': u'',
                u'checkedDomains': u'youtube',
                u'dnConn': u'',
                u'dsh': dsh,
                u'pstMsg': u'0',
                u'rmShown': u'1',
                u'secTok': u'',
                u'signIn': u'Sign in',
                u'timeStmp': u'',
                u'service': u'youtube',
                u'uilel': u'3',
                u'hl': u'en_US',
        }
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # The login form being echoed back means the credentials were rejected.
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username or password')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

        # Confirm age
        age_form = {
                'next_url': '/',
                'action_confirm': 'Confirm',
                }
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        try:
            self.report_age_confirmation()
            age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
            return

    def _extract_id(self, url):
        """Return the video ID from a URL (group 2 of _VALID_URL)."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return
        video_id = mobj.group(2)
        return video_id

    def _real_extract(self, url):
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        if mobj:
            url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
        try:
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
            return

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        if mobj is not None:
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
        else:
            player_url = None

        # Get video info: try several &el= variants until one yields a token.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            video_info_webpage = self._download_webpage(video_info_url, video_id,
                                    note=False,
                                    errnote='unable to download video info webpage')
            video_info = compat_parse_qs(video_info_webpage)
            if 'token' in video_info:
                break
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.report_error(u'YouTube said: %s' % video_info['reason'][0])
            else:
                self._downloader.report_error(u'"token" parameter not in video info for unknown reason')
            return

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.report_error(u'"rental" videos not supported')
            return

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.report_error(u'unable to extract uploader name')
            return
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        if mobj is not None:
            video_uploader_id = mobj.group(1)
        else:
            self._downloader.report_warning(u'unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            self._downloader.report_error(u'unable to extract video title')
            return
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.report_warning(u'unable to extract video thumbnail')
            video_thumbnail = ''
        else: # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        upload_date = None
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        if mobj is not None:
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                try:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
                except ValueError:
                    pass

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            video_description = ''

        # subtitles
        video_subtitles = None

        if self._downloader.params.get('writesubtitles', False):
            video_subtitles = self._extract_subtitle(video_id)
            if video_subtitles:
                (sub_error, sub_lang, sub) = video_subtitles[0]
                if sub_error:
                    self._downloader.report_error(sub_error)

        if self._downloader.params.get('allsubtitles', False):
            video_subtitles = self._extract_all_subtitles(video_id)
            for video_subtitle in video_subtitles:
                (sub_error, sub_lang, sub) = video_subtitle
                if sub_error:
                    self._downloader.report_error(sub_error)

        if self._downloader.params.get('listsubtitles', False):
            sub_lang_list = self._list_available_subtitles(video_id)
            return

        if 'length_seconds' not in video_info:
            self._downloader.report_warning(u'unable to extract video duration')
            video_duration = ''
        else:
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
            else:
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.report_error(u'no known formats available for video')
                return
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
                return
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            else:
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    if rf in url_map:
                        video_url_list = [(rf, url_map[rf])]
                        break
                if video_url_list is None:
                    self._downloader.report_error(u'requested format not available')
                    return
        else:
            self._downloader.report_error(u'no conn or url_encoded_fmt_stream_map information found in video info')
            return

        results = []
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

            results.append({
                'id': video_id,
                'url': video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date': upload_date,
                'title': video_title,
                'ext': video_extension,
                'format': video_format,
                'thumbnail': video_thumbnail,
                'description': video_description,
                'player_url': player_url,
                'subtitles': video_subtitles,
                'duration': video_duration
            })
        return results
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self.to_screen(u'Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen(u'Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen(u'%s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Extracting information' % video_id)

    def _real_initialize(self):
        """Fetch the family-filter disclaimer and disable the filter."""
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        try:
            self.report_disclaimer()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to retrieve disclaimer: %s' % compat_str(err))
            return

        # Confirm age
        disclaimer_form = {
            'filters': '0',
            'submit': "Continue - I'm over 18",
            }
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
        try:
            self.report_age_confirmation()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1), 'Youtube')]

        # Retrieve video webpage to extract further information
        webpage = self._download_webpage('http://www.metacafe.com/watch/%s/' % video_id, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            if mobj is None:
                video_url = mediaURL
            else:
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        else:
            # Fall back to the flashvars blob.
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            if mobj is None:
                self._downloader.report_error(u'unable to extract media URL')
                return
            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.report_error(u'unable to extract media URL')
                return
            mobj = re.search(r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
            if mobj is None:
                self._downloader.report_error(u'unable to extract media URL')
                return
            mediaURL = mobj.group('mediaURL').replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key'))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'submitter=(.*?);', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract uploader nickname')
            return
        video_uploader = mobj.group(1)

        return [{
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'upload_date': None,
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
        }]
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Extracting information' % video_id)

    def _real_extract(self, url):
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        # The path component may carry a slug and query string; keep only the id.
        video_id = mobj.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'\s*var flashvars = (.*)', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        flashvars = compat_urllib_parse.unquote(mobj.group(1))

        # Pick the best quality key present in flashvars, highest first.
        for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
            if key in flashvars:
                max_quality = key
                self.to_screen(u'Using %s' % key)
                break
        else:
            self._downloader.report_error(u'unable to extract video URL')
            return

        mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        if mobj is None:
            self._downloader.report_error(u'unable to extract video URL')
            return

        video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        video_title = unescapeHTML(mobj.group('title'))

        video_uploader = None
        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        if mobj is None:
            # lookin for official user
            mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if mobj_official is None:
                self._downloader.report_warning(u'unable to extract uploader nickname')
            else:
                video_uploader = mobj_official.group(1)
        else:
            video_uploader = mobj.group(1)

        video_upload_date = None
        mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        if mobj is not None:
            # Page shows DD-MM-YYYY; output format is YYYYMMDD.
            video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': video_uploader,
            'upload_date': video_upload_date,
            'title': video_title,
            'ext': video_extension,
        }]
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen(u'%s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Extracting information' % video_id)

    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))

        video_url = mediaURL

        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        video_title = mobj.group(1).decode('utf-8')

        video_uploader = mobj.group(2).decode('utf-8')

        return [{
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader,
            'upload_date': None,
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
        }]
948 class YahooIE(InfoExtractor):
949 """Information extractor for video.yahoo.com."""
# NOTE(review): this listing elides lines (the embedded numbering skips,
# e.g. 970->971->973 and 982->984): the missing lines are presumably `try:`
# statements, `if mobj is None:` guards and `return` statements -- confirm
# against the complete file before trusting the control flow below.
952 # _VALID_URL matches all Yahoo! Video URLs
953 # _VPAGE_URL matches only the extractable '/watch/' URLs
954 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
955 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
956 IE_NAME = u'video.yahoo'
958 def __init__(self, downloader=None):
959 InfoExtractor.__init__(self, downloader)
# Progress reporting helpers: forward a short status line to the screen.
961 def report_download_webpage(self, video_id):
962 """Report webpage download."""
963 self.to_screen(u'%s: Downloading webpage' % video_id)
965 def report_extraction(self, video_id):
966 """Report information extraction."""
967 self.to_screen(u'%s: Extracting information' % video_id)
# Main entry point: scrape the /watch/ page for metadata, then the FOP
# playlist XML for the real media URL.  new_video=False marks the single
# recursive call made after rewriting a non-/watch/ URL to its canonical
# form, so the rewrite cannot loop.
969 def _real_extract(self, url, new_video=True):
970 # Extract ID from URL
971 mobj = re.match(self._VALID_URL, url)
973 self._downloader.report_error(u'Invalid URL: %s' % url)
976 video_id = mobj.group(2)
977 video_extension = 'flv'
979 # Rewrite valid but non-extractable URLs as
980 # extractable English language /watch/ URLs
981 if re.match(self._VPAGE_URL, url) is None:
982 request = compat_urllib_request.Request(url)
984 webpage = compat_urllib_request.urlopen(request).read()
985 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
986 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
# Pull the canonical ("id", "vid") pair out of the page, rebuild the
# /watch/ URL from them and recurse exactly once.
989 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
991 self._downloader.report_error(u'Unable to extract id field')
993 yahoo_id = mobj.group(1)
995 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
997 self._downloader.report_error(u'Unable to extract vid field')
999 yahoo_vid = mobj.group(1)
1001 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
1002 return self._real_extract(url, new_video=False)
1004 # Retrieve video webpage to extract further information
1005 request = compat_urllib_request.Request(url)
1007 self.report_download_webpage(video_id)
1008 webpage = compat_urllib_request.urlopen(request).read()
1009 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1010 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
1013 # Extract uploader and title from webpage
1014 self.report_extraction(video_id)
# The .decode('utf-8') calls throughout imply `webpage` is a byte string
# (Python 2-era code) -- TODO confirm target runtime.
1015 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
1017 self._downloader.report_error(u'unable to extract video title')
1019 video_title = mobj.group(1).decode('utf-8')
1021 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1023 self._downloader.report_error(u'unable to extract video uploader')
# NOTE(review): group(1) of this regex is the literal 'people'/'profile'
# path segment; the uploader name is captured in group(2).  Using group(1)
# here looks like a bug -- confirm against upstream before changing.
1025 video_uploader = mobj.group(1).decode('utf-8')
1027 # Extract video thumbnail
1028 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1030 self._downloader.report_error(u'unable to extract video thumbnail')
1032 video_thumbnail = mobj.group(1).decode('utf-8')
1034 # Extract video description
1035 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1037 self._downloader.report_error(u'unable to extract video description')
1039 video_description = mobj.group(1).decode('utf-8')
1040 if not video_description:
1041 video_description = 'No description available.'
# Height/width are needed below as vidH/vidW query parameters for the
# playlist request; without them the playlist server answers 401.
1043 # Extract video height and width
1044 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1046 self._downloader.report_error(u'unable to extract video height')
1048 yv_video_height = mobj.group(1)
1050 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1052 self._downloader.report_error(u'unable to extract video width')
1054 yv_video_width = mobj.group(1)
1056 # Retrieve video playlist to extract media URL
1057 # I'm not completely sure what all these options are, but we
1058 # seem to need most of them, otherwise the server sends a 401.
1059 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
1060 yv_bitrate = '700' # according to Wikipedia this is hard-coded
1061 request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1062 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1063 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1065 self.report_download_webpage(video_id)
1066 webpage = compat_urllib_request.urlopen(request).read()
1067 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1068 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
1071 # Extract media URL from playlist XML
1072 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1074 self._downloader.report_error(u'Unable to extract media URL')
# Media URL = APP base + FULLPATH, percent-decoded and HTML-unescaped.
1076 video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1077 video_url = unescapeHTML(video_url)
# NOTE(review): the `return [{`/'url' scaffolding around this result dict
# is elided from this listing (numbering jumps 1077->1080, 1080->1082).
1080 'id': video_id.decode('utf-8'),
1082 'uploader': video_uploader,
1083 'upload_date': None,
1084 'title': video_title,
1085 'ext': video_extension.decode('utf-8'),
1086 'thumbnail': video_thumbnail.decode('utf-8'),
1087 'description': video_description,
1091 class VimeoIE(InfoExtractor):
1092 """Information extractor for vimeo.com."""
# NOTE(review): the embedded numbering skips (e.g. 1111->1113, 1123->1125),
# so `try:`, `if mobj is None:` and `return` lines are elided from this
# listing -- confirm the flow against the complete file.
1094 # _VALID_URL matches Vimeo URLs
1095 _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
1098 def __init__(self, downloader=None):
1099 InfoExtractor.__init__(self, downloader)
1101 def report_download_webpage(self, video_id):
1102 """Report webpage download."""
1103 self.to_screen(u'%s: Downloading webpage' % video_id)
1105 def report_extraction(self, video_id):
1106 """Report information extraction."""
1107 self.to_screen(u'%s: Extracting information' % video_id)
# Main entry point: normalise the URL, fetch the page, extract the embedded
# player config JSON, then choose a codec/quality pair and build the
# moogaloop play_redirect URL.
1109 def _real_extract(self, url, new_video=True):
1110 # Extract ID from URL
1111 mobj = re.match(self._VALID_URL, url)
1113 self._downloader.report_error(u'Invalid URL: %s' % url)
1116 video_id = mobj.group('id')
# Force an https:// scheme, and rewrite play_redirect_hls deep links to
# the plain video page before downloading.
1117 if not mobj.group('proto'):
1118 url = 'https://' + url
1119 if mobj.group('direct_link'):
1120 url = 'https://vimeo.com/' + video_id
1122 # Retrieve video webpage to extract further information
1123 request = compat_urllib_request.Request(url, None, std_headers)
1125 self.report_download_webpage(video_id)
1126 webpage_bytes = compat_urllib_request.urlopen(request).read()
1127 webpage = webpage_bytes.decode('utf-8')
1128 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1129 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
1132 # Now we begin extracting as much information as we can from what we
1133 # retrieved. First we extract the information common to all extractors,
1134 # and latter we extract those that are Vimeo specific.
1135 self.report_extraction(video_id)
1137 # Extract the config JSON
# The config blob is located by plain string splitting between
# ' = {config:' and ',assets:' -- brittle against page changes.
1139 config = webpage.split(' = {config:')[1].split(',assets:')[0]
1140 config = json.loads(config)
1142 self._downloader.report_error(u'unable to extract info section')
1146 video_title = config["video"]["title"]
1148 # Extract uploader and uploader_id
# uploader_id is the last path component of the owner's profile URL.
1149 video_uploader = config["video"]["owner"]["name"]
1150 video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]
1152 # Extract video thumbnail
1153 video_thumbnail = config["video"]["thumbnail"]
1155 # Extract video description
1156 video_description = get_element_by_attribute("itemprop", "description", webpage)
1157 if video_description: video_description = clean_html(video_description)
1158 else: video_description = u''
1160 # Extract upload date
# Date comes from the itemprop meta tag, reassembled as YYYYMMDD.
1161 video_upload_date = None
1162 mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
1163 if mobj is not None:
1164 video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)
1166 # Vimeo specific: extract request signature and timestamp
# sig/timestamp authenticate the play_redirect request built below.
1167 sig = config['request']['signature']
1168 timestamp = config['request']['timestamp']
1170 # Vimeo specific: extract video codec and quality information
1171 # First consider quality, then codecs, then take everything
1172 # TODO bind to format param
# Codec preference order: h264 (mp4), then vp8/vp6 (flv).  Files are
# bucketed by quality; 'other' keeps the file's own quality label.
1173 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1174 files = { 'hd': [], 'sd': [], 'other': []}
1175 for codec_name, codec_extension in codecs:
1176 if codec_name in config["video"]["files"]:
1177 if 'hd' in config["video"]["files"][codec_name]:
1178 files['hd'].append((codec_name, codec_extension, 'hd'))
1179 elif 'sd' in config["video"]["files"][codec_name]:
1180 files['sd'].append((codec_name, codec_extension, 'sd'))
1182 files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))
# Pick the first available bucket in hd > sd > other order; the loop's
# `break` after a hit appears to be elided from this listing.
1184 for quality in ('hd', 'sd', 'other'):
1185 if len(files[quality]) > 0:
1186 video_quality = files[quality][0][2]
1187 video_codec = files[quality][0][0]
1188 video_extension = files[quality][0][1]
1189 self.to_screen(u'%s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
1192 self._downloader.report_error(u'no known codec found')
1195 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1196 %(video_id, sig, timestamp, video_quality, video_codec.upper())
# NOTE(review): the `return [{`/'id'/'url' scaffolding around this result
# dict is elided (numbering jumps 1196->1201).
1201 'uploader': video_uploader,
1202 'uploader_id': video_uploader_id,
1203 'upload_date': video_upload_date,
1204 'title': video_title,
1205 'ext': video_extension,
1206 'thumbnail': video_thumbnail,
1207 'description': video_description,
1211 class ArteTvIE(InfoExtractor):
1212 """arte.tv information extractor."""
# NOTE(review): the embedded numbering skips throughout this class, so
# `try:`, `if mobj is None:` and `return` lines are elided from this
# listing -- confirm the flow against the complete file.
1214 _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
# _LIVE_URL matches the last URL path component of a live-stream page.
1215 _LIVE_URL = r'index-[0-9]+\.html$'
1217 IE_NAME = u'arte.tv'
1219 def __init__(self, downloader=None):
1220 InfoExtractor.__init__(self, downloader)
1222 def report_download_webpage(self, video_id):
1223 """Report webpage download."""
1224 self.to_screen(u'%s: Downloading webpage' % video_id)
1226 def report_extraction(self, video_id):
1227 """Report information extraction."""
1228 self.to_screen(u'%s: Extracting information' % video_id)
# Download `url` and return its raw body; network and URL errors are
# reported via the downloader (the `return webpage` line is elided here).
1230 def fetch_webpage(self, url):
1231 request = compat_urllib_request.Request(url)
1233 self.report_download_webpage(url)
1234 webpage = compat_urllib_request.urlopen(request).read()
1235 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1236 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
1238 except ValueError as err:
1239 self._downloader.report_error(u'Invalid URL: %s' % url)
# Fetch `url`, apply `regex` with `regexFlags`, and build a dict mapping
# each (group-index, key) pair from matchTuples to the matched group text.
# A missing group triggers the per-tuple error message `err`.
1243 def grep_webpage(self, url, regex, regexFlags, matchTuples):
1244 page = self.fetch_webpage(url)
1245 mobj = re.search(regex, page, regexFlags)
1249 self._downloader.report_error(u'Invalid URL: %s' % url)
1252 for (i, key, err) in matchTuples:
1253 if mobj.group(i) is None:
# NOTE(review): uses the deprecated trouble() here but report_error()
# elsewhere -- inconsistent error reporting; confirm intent upstream.
1254 self._downloader.trouble(err)
1257 info[key] = mobj.group(i)
# Live-stream path: locate the videothek JS config, then pull the rtmp
# path/player/url triple for the viewer's language.
1261 def extractLiveStream(self, url):
1262 video_lang = url.split('/')[-4]
1263 info = self.grep_webpage(
1265 r'src="(.*?/videothek_js.*?\.js)',
1268 (1, 'url', u'ERROR: Invalid URL: %s' % url)
1271 http_host = url.split('/')[2]
1272 next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
1273 info = self.grep_webpage(
1275 r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
1276 '(http://.*?\.swf).*?' +
1280 (1, 'path', u'ERROR: could not extract video path: %s' % url),
1281 (2, 'player', u'ERROR: could not extract video player: %s' % url),
1282 (3, 'url', u'ERROR: could not extract video url: %s' % url)
# NOTE(review): video_url is computed but no return statement is visible
# in this listing -- the live-stream result may be dropped; confirm.
1285 video_url = u'%s/%s' % (info.get('url'), info.get('path'))
# "Plus 7" catch-up path: follow the flash movie param to the videoref
# XML, pick the <video> entry for the page language, then read the
# id/title/date and hd-quality URL from the final XML document.
1287 def extractPlus7Stream(self, url):
1288 video_lang = url.split('/')[-3]
1289 info = self.grep_webpage(
1291 r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
1294 (1, 'url', u'ERROR: Invalid URL: %s' % url)
1297 next_url = compat_urllib_parse.unquote(info.get('url'))
1298 info = self.grep_webpage(
1300 r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
1303 (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
1306 next_url = compat_urllib_parse.unquote(info.get('url'))
1308 info = self.grep_webpage(
1310 r'<video id="(.*?)".*?>.*?' +
1311 '<name>(.*?)</name>.*?' +
1312 '<dateVideo>(.*?)</dateVideo>.*?' +
1313 '<url quality="hd">(.*?)</url>',
1316 (1, 'id', u'ERROR: could not extract video id: %s' % url),
1317 (2, 'title', u'ERROR: could not extract video title: %s' % url),
1318 (3, 'date', u'ERROR: could not extract video date: %s' % url),
1319 (4, 'url', u'ERROR: could not extract video url: %s' % url)
# NOTE(review): the `return [{` / 'url'-entry scaffolding around this
# result dict is elided (numbering jumps 1319->1324).
1324 'id': info.get('id'),
1325 'url': compat_urllib_parse.unquote(info.get('url')),
1326 'uploader': u'arte.tv',
1327 'upload_date': info.get('date'),
1328 'title': info.get('title').decode('utf-8'),
# Dispatch: live pages (index-NN.html) vs "Plus 7" catch-up pages.
1334 def _real_extract(self, url):
1335 video_id = url.split('/')[-1]
1336 self.report_extraction(video_id)
1338 if re.search(self._LIVE_URL, video_id) is not None:
1339 self.extractLiveStream(url)
1342 info = self.extractPlus7Stream(url)
1347 class GenericIE(InfoExtractor):
1348 """Generic last-resort information extractor."""
# NOTE(review): the embedded numbering skips throughout (e.g. 1373->1376,
# 1416->1421), so method bodies, guards and `return` lines are partially
# elided from this listing -- confirm against the complete file.
1351 IE_NAME = u'generic'
1353 def __init__(self, downloader=None):
1354 InfoExtractor.__init__(self, downloader)
1356 def report_download_webpage(self, video_id):
1357 """Report webpage download."""
# Warn only in real runs; the 'test' param suppresses the fallback notice.
1358 if not self._downloader.params.get('test', False):
1359 self._downloader.report_warning(u'Falling back on generic information extractor.')
1360 self.to_screen(u'%s: Downloading webpage' % video_id)
1362 def report_extraction(self, video_id):
1363 """Report information extraction."""
1364 self.to_screen(u'%s: Extracting information' % video_id)
1366 def report_following_redirect(self, new_url):
1367 """Report information extraction."""
1368 self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1370 def _test_redirect(self, url):
1371 """Check if it is a redirect, like url shorteners, in case return the new url."""
# HEAD-request wrapper; the get_method body (presumably `return "HEAD"`)
# is elided from this listing.
1372 class HeadRequest(compat_urllib_request.Request):
1373 def get_method(self):
1376 class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
1378 Subclass the HTTPRedirectHandler to make it use our
1379 HeadRequest also on the redirected URL
# On 301/302/303/307, re-issue a HEAD to the new URL with spaces escaped
# and body-related headers stripped; any other code raises HTTPError.
1381 def redirect_request(self, req, fp, code, msg, headers, newurl):
1382 if code in (301, 302, 303, 307):
1383 newurl = newurl.replace(' ', '%20')
1384 newheaders = dict((k,v) for k,v in req.headers.items()
1385 if k.lower() not in ("content-length", "content-type"))
1386 return HeadRequest(newurl,
1388 origin_req_host=req.get_origin_req_host(),
1391 raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)
1393 class HTTPMethodFallback(compat_urllib_request.BaseHandler):
1395 Fallback to GET if HEAD is not allowed (405 HTTP error)
# Retry the same URL as a plain GET, again without body headers.
1397 def http_error_405(self, req, fp, code, msg, headers):
1401 newheaders = dict((k,v) for k,v in req.headers.items()
1402 if k.lower() not in ("content-length", "content-type"))
1403 return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
1405 origin_req_host=req.get_origin_req_host(),
# Build a bespoke opener so only the handlers above run, then issue the
# HEAD request and compare the final URL with the original (the
# comparison/early-return lines are elided here).
1409 opener = compat_urllib_request.OpenerDirector()
1410 for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
1411 HTTPMethodFallback, HEADRedirectHandler,
1412 compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
1413 opener.add_handler(handler())
1415 response = opener.open(HeadRequest(url))
1416 new_url = response.geturl()
1421 self.report_following_redirect(new_url)
# Last-resort extraction: follow redirects, then probe the page with a
# series of increasingly loose regexes for a direct media URL.
1424 def _real_extract(self, url):
1425 new_url = self._test_redirect(url)
1426 if new_url: return [self.url_result(new_url)]
1428 video_id = url.split('/')[-1]
1430 webpage = self._download_webpage(url, video_id)
1431 except ValueError as err:
1432 # since this is the last-resort InfoExtractor, if
1433 # this error is thrown, it'll be thrown here
1434 self._downloader.report_error(u'Invalid URL: %s' % url)
1437 self.report_extraction(video_id)
1438 # Start with something easy: JW Player in SWFObject
1439 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1441 # Broaden the search a little bit
1442 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1444 # Broaden the search a little bit: JWPlayer JS loader
1445 mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
1447 self._downloader.report_error(u'Invalid URL: %s' % url)
1450 # It's possible that one of the regexes
1451 # matched, but returned an empty group:
1452 if mobj.group(1) is None:
1453 self._downloader.report_error(u'Invalid URL: %s' % url)
1456 video_url = compat_urllib_parse.unquote(mobj.group(1))
1457 video_id = os.path.basename(video_url)
# Derive extension and id from the URL basename.
1459 # here's a fun little line of code for you:
1460 video_extension = os.path.splitext(video_id)[1][1:]
1461 video_id = os.path.splitext(video_id)[0]
1463 # it's tempting to parse this further, but you would
1464 # have to take into account all the variations like
1465 #   Video Title - Site Name
1466 #   Site Name | Video Title
1467 #   Video Title - Tagline | Site Name
1468 # and so on and so forth; it's just not practical
1469 mobj = re.search(r'<title>(.*)</title>', webpage)
1471 self._downloader.report_error(u'unable to extract title')
1473 video_title = mobj.group(1)
1475 # video uploader is domain name
1476 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
# NOTE(review): this error message says 'title' but refers to the
# uploader/domain extraction -- likely a copy-paste slip; confirm.
1478 self._downloader.report_error(u'unable to extract title')
1480 video_uploader = mobj.group(1)
# NOTE(review): the `return [{`/'id'/'url' scaffolding around this result
# dict is elided (numbering jumps 1480->1485).
1485 'uploader': video_uploader,
1486 'upload_date': None,
1487 'title': video_title,
1488 'ext': video_extension,
1492 class YoutubeSearchIE(InfoExtractor):
1493 """Information Extractor for YouTube search queries."""
# NOTE(review): the embedded numbering skips (e.g. 1519->1524), so the
# `try:` around the prefix parse, `if mobj is None:` guards and some
# `return` lines are elided -- confirm against the complete file.
1494 _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
# GData v2 search feed; %i is the 1-based start index, 50 results/page.
1495 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1496 _max_youtube_results = 1000
1497 IE_NAME = u'youtube:search'
1499 def __init__(self, downloader=None):
1500 InfoExtractor.__init__(self, downloader)
1502 def report_download_page(self, query, pagenum):
1503 """Report attempt to download search page with given number."""
1504 query = query.decode(preferredencoding())
1505 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
# Parse the "ytsearch[N|all]:terms" pseudo-URL: empty prefix -> 1 result,
# 'all' -> up to _max_youtube_results, otherwise prefix is the count.
1507 def _real_extract(self, query):
1508 mobj = re.match(self._VALID_URL, query)
1510 self._downloader.report_error(u'invalid search query "%s"' % query)
1513 prefix, query = query.split(':')
1515 query = query.encode('utf-8')
1517 return self._get_n_results(query, 1)
1518 elif prefix == 'all':
# NOTE(review): missing `return` before this call (other branches return)
# -- presumably elided; confirm.
1519 self._get_n_results(query, self._max_youtube_results)
1524 self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
1526 elif n > self._max_youtube_results:
1527 self._downloader.report_warning(u'ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1528 n = self._max_youtube_results
1529 return self._get_n_results(query, n)
1530 except ValueError: # parsing prefix as integer fails
1531 return self._get_n_results(query, 1)
1533 def _get_n_results(self, query, n):
1534 """Get a specified number of results for a query"""
# Page through the API 50 ids at a time until `limit` (min of n and the
# reported totalItems) is covered, then trim and wrap as url_results.
1540 while (50 * pagenum) < limit:
1541 self.report_download_page(query, pagenum+1)
1542 result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
1543 request = compat_urllib_request.Request(result_url)
1545 data = compat_urllib_request.urlopen(request).read().decode('utf-8')
1546 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1547 self._downloader.report_error(u'unable to download API page: %s' % compat_str(err))
1549 api_response = json.loads(data)['data']
1551 if not 'items' in api_response:
1552 self._downloader.trouble(u'[youtube] No video results')
1555 new_ids = list(video['id'] for video in api_response['items'])
1556 video_ids += new_ids
1558 limit = min(n, api_response['totalItems'])
1561 if len(video_ids) > n:
1562 video_ids = video_ids[:n]
1563 videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids]
1567 class GoogleSearchIE(InfoExtractor):
1568 """Information Extractor for Google Video search queries."""
# NOTE(review): unlike YoutubeSearchIE this extractor triggers downloads
# directly via self._downloader.download(...) instead of returning
# results; numbering gaps show elided guards/try/return lines -- confirm.
1569 _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
1570 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
# Regex for video links on a result page / the "next page" marker.
1571 _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
1572 _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
1573 _max_google_results = 1000
1574 IE_NAME = u'video.google:search'
1576 def __init__(self, downloader=None):
1577 InfoExtractor.__init__(self, downloader)
1579 def report_download_page(self, query, pagenum):
1580 """Report attempt to download playlist page with given number."""
1581 query = query.decode(preferredencoding())
1582 self.to_screen(u'query "%s": Downloading page %s' % (query, pagenum))
# Parse "gvsearch[N|all]:terms" exactly like the other search IEs:
# empty -> 1 result, 'all' -> max, numeric prefix -> that many.
1584 def _real_extract(self, query):
1585 mobj = re.match(self._VALID_URL, query)
1587 self._downloader.report_error(u'invalid search query "%s"' % query)
1590 prefix, query = query.split(':')
1592 query = query.encode('utf-8')
1594 self._download_n_results(query, 1)
1596 elif prefix == 'all':
1597 self._download_n_results(query, self._max_google_results)
1603 self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
1605 elif n > self._max_google_results:
1606 self._downloader.report_warning(u'gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1607 n = self._max_google_results
1608 self._download_n_results(query, n)
1610 except ValueError: # parsing prefix as integer fails
1611 self._download_n_results(query, 1)
1614 def _download_n_results(self, query, n):
1615 """Downloads a specified number of results for a query"""
# Scrape result pages, collecting unique docids, until n videos are found
# or there is no next page; then hand every id to the downloader.
1621 self.report_download_page(query, pagenum)
1622 result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
1623 request = compat_urllib_request.Request(result_url)
1625 page = compat_urllib_request.urlopen(request).read()
1626 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1627 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
1630 # Extract video identifiers
1631 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1632 video_id = mobj.group(1)
1633 if video_id not in video_ids:
1634 video_ids.append(video_id)
1635 if len(video_ids) == n:
1636 # Specified n videos reached
1637 for id in video_ids:
1638 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1641 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1642 for id in video_ids:
1643 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1646 pagenum = pagenum + 1
1649 class YahooSearchIE(InfoExtractor):
1650 """Information Extractor for Yahoo! Video search queries."""
# NOTE(review): structurally parallel to GoogleSearchIE (downloads via
# self._downloader.download); numbering gaps show elided guards/try/return
# lines -- confirm against the complete file.
1653 _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
1654 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1655 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1656 _MORE_PAGES_INDICATOR = r'\s*Next'
1657 _max_yahoo_results = 1000
1658 IE_NAME = u'video.yahoo:search'
1660 def __init__(self, downloader=None):
1661 InfoExtractor.__init__(self, downloader)
1663 def report_download_page(self, query, pagenum):
1664 """Report attempt to download playlist page with given number."""
1665 query = query.decode(preferredencoding())
1666 self.to_screen(u'query "%s": Downloading page %s' % (query, pagenum))
# Parse "yvsearch[N|all]:terms": empty -> 1, 'all' -> max, numeric -> N.
1668 def _real_extract(self, query):
1669 mobj = re.match(self._VALID_URL, query)
1671 self._downloader.report_error(u'invalid search query "%s"' % query)
1674 prefix, query = query.split(':')
1676 query = query.encode('utf-8')
1678 self._download_n_results(query, 1)
1680 elif prefix == 'all':
1681 self._download_n_results(query, self._max_yahoo_results)
1687 self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
1689 elif n > self._max_yahoo_results:
1690 self._downloader.report_warning(u'yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1691 n = self._max_yahoo_results
1692 self._download_n_results(query, n)
1694 except ValueError: # parsing prefix as integer fails
1695 self._download_n_results(query, 1)
1698 def _download_n_results(self, query, n):
1699 """Downloads a specified number of results for a query"""
# Dedup with an explicit set here (GoogleSearchIE uses a list membership
# test instead); page until n ids collected or no Next link remains.
1702 already_seen = set()
1706 self.report_download_page(query, pagenum)
1707 result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
1708 request = compat_urllib_request.Request(result_url)
1710 page = compat_urllib_request.urlopen(request).read()
1711 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1712 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
1715 # Extract video identifiers
1716 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1717 video_id = mobj.group(1)
1718 if video_id not in already_seen:
1719 video_ids.append(video_id)
1720 already_seen.add(video_id)
1721 if len(video_ids) == n:
1722 # Specified n videos reached
1723 for id in video_ids:
1724 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1727 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1728 for id in video_ids:
1729 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1732 pagenum = pagenum + 1
1735 class YoutubePlaylistIE(InfoExtractor):
1736 """Information Extractor for YouTube playlists."""
# NOTE(review): the embedded numbering skips inside _VALID_URL and the
# method bodies (e.g. 1744->1747), so parts of the verbose regex, the
# `_MAX_RESULTS` definition, `try:` lines and `return`s are elided from
# this listing -- confirm against the complete file.
1738 _VALID_URL = r"""(?:
1743 (?:course|view_play_list|my_playlists|artist|playlist|watch)
1744 \? (?:.*?&)*? (?:p|a|list)=
1747 ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
1750 ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
# GData v2 playlist feed; filled with (playlist_id, page size, start idx).
1752 _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
1754 IE_NAME = u'youtube:playlist'
1756 def __init__(self, downloader=None):
1757 InfoExtractor.__init__(self, downloader)
# Overridden because _VALID_URL is written with re.VERBOSE, which the
# base-class suitable() does not pass.
1760 def suitable(cls, url):
1761 """Receives a URL and returns True if suitable for this IE."""
1762 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
1764 def report_download_page(self, playlist_id, pagenum):
1765 """Report attempt to download playlist page with given number."""
1766 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
# Page through the GData playlist feed, collect (position, watch-url)
# pairs, sort by position and return a single playlist_result.
1768 def _real_extract(self, url):
1769 # Extract playlist id
1770 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
1772 self._downloader.report_error(u'invalid url: %s' % url)
1775 # Download playlist videos from API
1776 playlist_id = mobj.group(1) or mobj.group(2)
1781 self.report_download_page(playlist_id, page_num)
1783 url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
1785 page = compat_urllib_request.urlopen(url).read().decode('utf8')
1786 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1787 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
1791 response = json.loads(page)
1792 except ValueError as err:
1793 self._downloader.report_error(u'Invalid JSON in API response: ' + compat_str(err))
1796 if 'feed' not in response:
1797 self._downloader.report_error(u'Got a malformed response from YouTube API')
1799 if 'entry' not in response['feed']:
1800 # Number of videos is a multiple of self._MAX_RESULTS
1803 playlist_title = response['feed']['title']['$t']
# Entries without a 'content' key (deleted/private videos) are skipped.
1805 videos += [ (entry['yt$position']['$t'], entry['content']['src'])
1806 for entry in response['feed']['entry']
1807 if 'content' in entry ]
# A short page means this was the last one (the `break` is elided here).
1809 if len(response['feed']['entry']) < self._MAX_RESULTS:
1813 videos = [v[1] for v in sorted(videos)]
1815 url_results = [self.url_result(url, 'Youtube') for url in videos]
1816 return [self.playlist_result(url_results, playlist_id, playlist_title)]
1819 class YoutubeChannelIE(InfoExtractor):
1820 """Information Extractor for YouTube channels."""
# NOTE(review): numbering gaps (e.g. 1843->1846, 1853->1855) show elided
# guard/try/return lines -- confirm against the complete file.
1822 _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
# First page comes from the HTML list view; later pages from the
# channel_ajax JSON endpoint below.
1823 _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
1824 _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
1825 _MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
1826 IE_NAME = u'youtube:channel'
1828 def report_download_page(self, channel_id, pagenum):
1829 """Report attempt to download channel page with given number."""
1830 self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))
# Collect unique video ids from /watch?v= links in an HTML fragment.
1832 def extract_videos_from_page(self, page):
1834 for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
1835 if mobj.group(1) not in ids_in_page:
1836 ids_in_page.append(mobj.group(1))
# Scrape the first channel page, then follow the JSON-based channel_ajax
# continuation while the load-more widget is still present.
1839 def _real_extract(self, url):
1840 # Extract channel id
1841 mobj = re.match(self._VALID_URL, url)
1843 self._downloader.report_error(u'invalid url: %s' % url)
1846 # Download channel page
1847 channel_id = mobj.group(1)
1851 self.report_download_page(channel_id, pagenum)
1852 url = self._TEMPLATE_URL % (channel_id, pagenum)
1853 request = compat_urllib_request.Request(url)
1855 page = compat_urllib_request.urlopen(request).read().decode('utf8')
1856 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1857 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
1860 # Extract video identifiers
1861 ids_in_page = self.extract_videos_from_page(page)
1862 video_ids.extend(ids_in_page)
1864 # Download any subsequent channel pages using the json-based channel_ajax query
1865 if self._MORE_PAGES_INDICATOR in page:
1867 pagenum = pagenum + 1
1869 self.report_download_page(channel_id, pagenum)
1870 url = self._MORE_PAGES_URL % (pagenum, channel_id)
1871 request = compat_urllib_request.Request(url)
1873 page = compat_urllib_request.urlopen(request).read().decode('utf8')
1874 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1875 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
# The ajax endpoint returns JSON: HTML fragment in 'content_html', the
# pager widget in 'load_more_widget_html'.
1878 page = json.loads(page)
1880 ids_in_page = self.extract_videos_from_page(page['content_html'])
1881 video_ids.extend(ids_in_page)
# Stop when the pager widget no longer offers a load-more control
# (the `break` is elided from this listing).
1883 if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
1886 self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))
1888 urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids]
1889 url_entries = [self.url_result(url, 'Youtube') for url in urls]
1890 return [self.playlist_result(url_entries, channel_id)]
1893 class YoutubeUserIE(InfoExtractor):
1894 """Information Extractor for YouTube users."""
# NOTE(review): numbering gaps (e.g. 1913->1915, 1932->1935) show elided
# guard/try lines -- confirm against the complete file.
1896 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1897 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1898 _GDATA_PAGE_SIZE = 50
1899 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
1900 _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
1901 IE_NAME = u'youtube:user'
1903 def __init__(self, downloader=None):
1904 InfoExtractor.__init__(self, downloader)
1906 def report_download_page(self, username, start_index):
1907 """Report attempt to download user page."""
1908 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
1909 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
# Page through the user's GData uploads feed, 50 ids per request,
# stopping when a page comes back short; return one playlist_result.
1911 def _real_extract(self, url):
1913 mobj = re.match(self._VALID_URL, url)
1915 self._downloader.report_error(u'invalid url: %s' % url)
1918 username = mobj.group(1)
1920 # Download video ids using YouTube Data API. Result size per
1921 # query is limited (currently to 50 videos) so we need to query
1922 # page by page until there are no video ids - it means we got
# start-index is 1-based in the GData API.
1929 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1930 self.report_download_page(username, start_index)
1932 request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
1935 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1936 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1937 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
1940 # Extract video identifiers
1943 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1944 if mobj.group(1) not in ids_in_page:
1945 ids_in_page.append(mobj.group(1))
1947 video_ids.extend(ids_in_page)
1949 # A little optimization - if current page is not
1950 # "full", ie. does not contain PAGE_SIZE video ids then
1951 # we can assume that this page is the last one - there
1952 # are no more ids on further pages - no need to query
1955 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
1960 urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
1961 url_results = [self.url_result(url, 'Youtube') for url in urls]
1962 return [self.playlist_result(url_results, playlist_title = username)]
1965 class BlipTVUserIE(InfoExtractor):
1966 """Information Extractor for blip.tv users."""
# NOTE(review): the `_PAGE_SIZE` class attribute referenced below is not
# visible in this listing (elided), and numbering gaps show missing
# guard/try lines -- confirm against the complete file.
1968 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
1970 IE_NAME = u'blip.tv:user'
1972 def __init__(self, downloader=None):
1973 InfoExtractor.__init__(self, downloader)
1975 def report_download_page(self, username, pagenum):
1976 """Report attempt to download user page."""
1977 self.to_screen(u'user %s: Downloading video ids from page %d' %
1978 (username, pagenum))
# Resolve the numeric users_id from the user's mobile page, then page
# through the show_get_full_episode_list AJAX endpoint collecting hrefs.
1980 def _real_extract(self, url):
1982 mobj = re.match(self._VALID_URL, url)
1984 self._downloader.report_error(u'invalid url: %s' % url)
1987 username = mobj.group(1)
1989 page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
1991 request = compat_urllib_request.Request(url)
1994 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1995 mobj = re.search(r'data-users-id="([^"]+)"', page)
1996 page_base = page_base % mobj.group(1)
1997 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1998 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
2002 # Download video ids using BlipTV Ajax calls. Result size per
2003 # query is limited (currently to 12 videos) so we need to query
2004 # page by page until there are no video ids - it means we got
2011 self.report_download_page(username, pagenum)
2012 url = page_base + "&page=" + str(pagenum)
2013 request = compat_urllib_request.Request( url )
2015 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
2016 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
# NOTE(review): uses str(err) here, compat_str(err) everywhere else in
# this file -- inconsistent; on Python 2 str() can raise on non-ASCII.
2017 self._downloader.report_error(u'unable to download webpage: %s' % str(err))
2020 # Extract video identifiers
2023 for mobj in re.finditer(r'href="/([^"]+)"', page):
2024 if mobj.group(1) not in ids_in_page:
2025 ids_in_page.append(unescapeHTML(mobj.group(1)))
2027 video_ids.extend(ids_in_page)
2029 # A little optimization - if current page is not
2030 # "full", ie. does not contain PAGE_SIZE video ids then
2031 # we can assume that this page is the last one - there
2032 # are no more ids on further pages - no need to query
2035 if len(ids_in_page) < self._PAGE_SIZE:
2040 urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
2041 url_entries = [self.url_result(url, 'BlipTV') for url in urls]
2042 return [self.playlist_result(url_entries, playlist_title = username)]
# DepositFilesIE: extracts a direct download URL from depositfiles.com by
# POSTing the "Free download" form (gateway_result=1) and scraping the
# resulting page for the fileshare form action and the <b title="..."> title.
# NOTE(review): excerpt with original line numbers fused in and lines elided
# (the `if mobj is None:` guards, `try:`, `return` of the info dict, and the
# dict's opening/closing braces are missing) — code kept byte-identical.
2045 class DepositFilesIE(InfoExtractor):
2046 """Information extractor for depositfiles.com"""
2048 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
2050 def report_download_webpage(self, file_id):
2051 """Report webpage download."""
2052 self.to_screen(u'%s: Downloading webpage' % file_id)
2054 def report_extraction(self, file_id):
2055 """Report information extraction."""
2056 self.to_screen(u'%s: Extracting information' % file_id)
2058 def _real_extract(self, url):
2059 file_id = url.split('/')[-1]
2060 # Rebuild url in english locale
2061 url = 'http://depositfiles.com/en/files/' + file_id
2063 # Retrieve file webpage with 'Free download' button pressed
2064 free_download_indication = { 'gateway_result' : '1' }
2065 request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
2067 self.report_download_webpage(file_id)
# NOTE(review): webpage is read as bytes and later regex-matched and
# .decode()d — presumably Python-2-era str handling; verify on Py3.
2068 webpage = compat_urllib_request.urlopen(request).read()
2069 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2070 self._downloader.report_error(u'Unable to retrieve file webpage: %s' % compat_str(err))
2073 # Search for the real file URL
2074 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2075 if (mobj is None) or (mobj.group(1) is None):
2076 # Try to figure out reason of the error.
2077 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2078 if (mobj is not None) and (mobj.group(1) is not None):
2079 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2080 self._downloader.report_error(u'%s' % restriction_message)
2082 self._downloader.report_error(u'unable to extract download URL from: %s' % url)
2085 file_url = mobj.group(1)
2086 file_extension = os.path.splitext(file_url)[1][1:]
2088 # Search for file title
2089 mobj = re.search(r'<b title="(.*?)">', webpage)
2091 self._downloader.report_error(u'unable to extract title')
2093 file_title = mobj.group(1).decode('utf-8')
# Fields below belong to the returned info dict (braces elided in this
# excerpt).
2096 'id': file_id.decode('utf-8'),
2097 'url': file_url.decode('utf-8'),
2099 'upload_date': None,
2100 'title': file_title,
2101 'ext': file_extension.decode('utf-8'),
# FacebookIE: extracts Facebook videos. _real_initialize logs in (credentials
# from --username/--password or .netrc under machine "facebook") so that
# non-public videos become reachable; _real_extract parses the swf param/var
# JSON blob embedded in the video page for hd_src/sd_src stream URLs.
# NOTE(review): excerpt with original line numbers fused in and lines elided
# (guards, `try:` headers, login_form construction, returns) — kept byte-identical.
2105 class FacebookIE(InfoExtractor):
2106 """Information Extractor for Facebook"""
2108 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2109 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2110 _NETRC_MACHINE = 'facebook'
2111 IE_NAME = u'facebook'
2113 def report_login(self):
2114 """Report attempt to log in."""
2115 self.to_screen(u'Logging in')
2117 def _real_initialize(self):
# Nothing to do when no downloader is attached (elided early return).
2118 if self._downloader is None:
2123 downloader_params = self._downloader.params
2125 # Attempt to use provided username and password or .netrc data
2126 if downloader_params.get('username', None) is not None:
2127 useremail = downloader_params['username']
2128 password = downloader_params['password']
2129 elif downloader_params.get('usenetrc', False):
2131 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2132 if info is not None:
2136 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2137 except (IOError, netrc.NetrcParseError) as err:
# .netrc problems only warn; login is skipped, not fatal.
2138 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
2141 if useremail is None:
2150 request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
# A login form still present in the response means auth failed
# (bad credentials or rate limit) — warn and continue anonymously.
2153 login_results = compat_urllib_request.urlopen(request).read()
2154 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2155 self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2157 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2158 self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
2161 def _real_extract(self, url):
2162 mobj = re.match(self._VALID_URL, url)
2164 self._downloader.report_error(u'invalid URL: %s' % url)
2166 video_id = mobj.group('ID')
2168 url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
2169 webpage = self._download_webpage(url, video_id)
# The player config lives between these two literal JS fragments; the
# captured middle is a JSON array of [name, value] pairs.
2171 BEFORE = '{swf.addParam(param[0], param[1]);});\n'
2172 AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
2173 m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
2175 raise ExtractorError(u'Cannot parse data')
2176 data = dict(json.loads(m.group(1)))
2177 params_raw = compat_urllib_parse.unquote(data['params'])
2178 params = json.loads(params_raw)
2179 video_data = params['video_data'][0]
# Prefer the HD stream, fall back to SD (elided `if video_url is None:`).
2180 video_url = video_data.get('hd_src')
2182 video_url = video_data['sd_src']
2184 raise ExtractorError(u'Cannot find video URL')
2185 video_duration = int(video_data['video_duration'])
2186 thumbnail = video_data['thumbnail_src']
2188 m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
2190 raise ExtractorError(u'Cannot find title in webpage')
2191 video_title = unescapeHTML(m.group(1))
# Fields of the returned info dict (braces elided in this excerpt).
2195 'title': video_title,
2198 'duration': video_duration,
2199 'thumbnail': thumbnail,
# BlipTVIE: extracts a single blip.tv video. /play/ URLs are resolved via
# their HTTP redirect (file id taken from the redirect's fragment) and then
# re-extracted; otherwise the page is re-requested with skin=json to get a
# machine-readable 'Post' description. A video/* Content-Type short-circuits
# into a direct-download info dict.
# NOTE(review): excerpt with original line numbers fused in and lines elided
# (guards, `try:` headers, the cchar '?'/'&' choice, info-dict braces,
# returns) — code kept byte-identical.
2204 class BlipTVIE(InfoExtractor):
2205 """Information extractor for blip.tv"""
2207 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
2208 _URL_EXT = r'^.*\.([a-z0-9]+)$'
2209 IE_NAME = u'blip.tv'
2211 def report_extraction(self, file_id):
2212 """Report information extraction."""
2213 self.to_screen(u'%s: Extracting information' % file_id)
2215 def report_direct_download(self, title):
2216 """Report information extraction."""
2217 self.to_screen(u'%s: Direct download detected' % title)
2219 def _real_extract(self, url):
2220 mobj = re.match(self._VALID_URL, url)
2222 self._downloader.report_error(u'invalid URL: %s' % url)
# /play/ URLs redirect to a page whose fragment carries the real file;
# rebuild a canonical /a/a-<id> URL and recurse once.
2225 urlp = compat_urllib_parse_urlparse(url)
2226 if urlp.path.startswith('/play/'):
2227 request = compat_urllib_request.Request(url)
2228 response = compat_urllib_request.urlopen(request)
2229 redirecturl = response.geturl()
2230 rurlp = compat_urllib_parse_urlparse(redirecturl)
2231 file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
2232 url = 'http://blip.tv/a/a-' + file_id
2233 return self._real_extract(url)
2240 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2241 request = compat_urllib_request.Request(json_url)
# blip.tv serves different (JSON-friendly) content to the iTunes UA.
2242 request.add_header('User-Agent', 'iTunes/10.6.1')
2243 self.report_extraction(mobj.group(1))
2246 urlh = compat_urllib_request.urlopen(request)
2247 if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
2248 basename = url.split('/')[-1]
2249 title,ext = os.path.splitext(basename)
# NOTE(review): .decode on a str is Python-2-only; verify on Py3.
2250 title = title.decode('UTF-8')
2251 ext = ext.replace('.', '')
2252 self.report_direct_download(title)
2257 'upload_date': None,
2262 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2263 raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
2264 if info is None: # Regular URL
2266 json_code_bytes = urlh.read()
2267 json_code = json_code_bytes.decode('utf-8')
2268 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2269 self._downloader.report_error(u'unable to read video info webpage: %s' % compat_str(err))
2273 json_data = json.loads(json_code)
# skin=json wraps the payload in 'Post' for single videos.
2274 if 'Post' in json_data:
2275 data = json_data['Post']
# datestamp like '08-24-12 06:40PM' -> YYYYMMDD.
2279 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2280 video_url = data['media']['url']
2281 umobj = re.match(self._URL_EXT, video_url)
2283 raise ValueError('Can not determine filename extension')
2284 ext = umobj.group(1)
# Fields of the returned info dict (braces elided in this excerpt);
# user_agent is carried so the downloader replays the iTunes UA.
2287 'id': data['item_id'],
2289 'uploader': data['display_name'],
2290 'upload_date': upload_date,
2291 'title': data['title'],
2293 'format': data['media']['mimeType'],
2294 'thumbnail': data['thumbnailUrl'],
2295 'description': data['description'],
2296 'player_url': data['embedUrl'],
2297 'user_agent': 'iTunes/10.6.1',
2299 except (ValueError,KeyError) as err:
2300 self._downloader.report_error(u'unable to parse video information: %s' % repr(err))
# MyVideoIE: extracts myvideo.de videos by scraping the watch page for the
# image_src thumbnail link (whose base path doubles as the media path) and
# deriving the .flv URL from it; the <title> tag supplies the video title.
# NOTE(review): excerpt with original line numbers fused in and lines elided
# (guards, second arg of the re.search call, info-dict braces, return).
2306 class MyVideoIE(InfoExtractor):
2307 """Information Extractor for myvideo.de."""
2309 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2310 IE_NAME = u'myvideo'
2312 def __init__(self, downloader=None):
2313 InfoExtractor.__init__(self, downloader)
2315 def report_extraction(self, video_id):
2316 """Report information extraction."""
2317 self.to_screen(u'%s: Extracting information' % video_id)
2319 def _real_extract(self,url):
2320 mobj = re.match(self._VALID_URL, url)
# NOTE(review): 'self._download' on the next line looks like a typo for
# 'self._downloader' (every sibling extractor uses self._downloader);
# it would raise AttributeError on an invalid URL — confirm and fix.
2322 self._download.report_error(u'invalid URL: %s' % url)
2325 video_id = mobj.group(1)
2328 webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
2329 webpage = self._download_webpage(webpage_url, video_id)
2331 self.report_extraction(video_id)
2332 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\'',
2335 self._downloader.report_error(u'unable to extract media URL')
# The movie base path from the thumbnail link + /<id>.flv is the media URL.
2337 video_url = mobj.group(1) + ('/%s.flv' % video_id)
2339 mobj = re.search('<title>([^<]+)</title>', webpage)
2341 self._downloader.report_error(u'unable to extract title')
2344 video_title = mobj.group(1)
# Fields of the returned info dict (braces elided in this excerpt).
2350 'upload_date': None,
2351 'title': video_title,
# ComedyCentralIE: extracts The Daily Show / Colbert Report episodes and clips.
# Flow: resolve :shortname abbreviations to full-episode URLs, follow the HTTP
# redirect to a concrete episode, pull the mtvnservices media URI from the
# page, fetch the MRSS show index, and for each <item> fetch its mediagen
# config to pick a bitrate/rendition, finally rewriting the rtmp URL into the
# known llnwd.net HTTP mirror.
# NOTE(review): excerpt with original line numbers fused in and lines elided
# (guards, `try:` headers, the _available_formats/_video_extensions/
# _video_dimensions literals, turls construction, info-dict braces, returns).
2355 class ComedyCentralIE(InfoExtractor):
2356 """Information extractor for The Daily Show and Colbert Report """
2358 # urls can be abbreviations like :thedailyshow or :colbert
2359 # urls for episodes like:
2360 # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
2361 # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
2362 # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
2363 _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
2364 |(https?://)?(www\.)?
2365 (?P<showname>thedailyshow|colbertnation)\.com/
2366 (full-episodes/(?P<episode>.*)|
2368 (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
2369 |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
2372 _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
2374 _video_extensions = {
2382 _video_dimensions = {
# suitable() is overridden because _VALID_URL is a verbose regex and the
# base-class match (see file head) does not pass re.VERBOSE.
2392 def suitable(cls, url):
2393 """Receives a URL and returns True if suitable for this IE."""
2394 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
2396 def report_extraction(self, episode_id):
2397 self.to_screen(u'%s: Extracting information' % episode_id)
2399 def report_config_download(self, episode_id, media_id):
2400 self.to_screen(u'%s: Downloading configuration for %s' % (episode_id, media_id))
2402 def report_index_download(self, episode_id):
2403 self.to_screen(u'%s: Downloading show index' % episode_id)
2405 def _print_formats(self, formats):
2406 print('Available formats:')
2408 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))
2411 def _real_extract(self, url):
2412 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2414 self._downloader.report_error(u'invalid URL: %s' % url)
# :tds / :colbert style abbreviations expand to the show's
# full-episodes landing page and are re-matched.
2417 if mobj.group('shortname'):
2418 if mobj.group('shortname') in ('tds', 'thedailyshow'):
2419 url = u'http://www.thedailyshow.com/full-episodes/'
2421 url = u'http://www.colbertnation.com/full-episodes/'
2422 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2423 assert mobj is not None
2425 if mobj.group('clip'):
2426 if mobj.group('showname') == 'thedailyshow':
2427 epTitle = mobj.group('tdstitle')
2429 epTitle = mobj.group('cntitle')
2432 dlNewest = not mobj.group('episode')
2434 epTitle = mobj.group('showname')
2436 epTitle = mobj.group('episode')
2438 req = compat_urllib_request.Request(url)
2439 self.report_extraction(epTitle)
2441 htmlHandle = compat_urllib_request.urlopen(req)
2442 html = htmlHandle.read()
2443 webpage = html.decode('utf-8')
2444 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2445 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
# The landing page redirects to the newest episode; re-match the final
# URL (elided: this runs under a dlNewest branch).
2448 url = htmlHandle.geturl()
2449 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2451 self._downloader.report_error(u'Invalid redirected URL: ' + url)
2453 if mobj.group('episode') == '':
2454 self._downloader.report_error(u'Redirected URL is still not specific: ' + url)
2456 epTitle = mobj.group('episode')
2458 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)
2460 if len(mMovieParams) == 0:
2461 # The Colbert Report embeds the information in a without
2462 # a URL prefix; so extract the alternate reference
2463 # and then add the URL prefix manually.
2465 altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
2466 if len(altMovieParams) == 0:
2467 self._downloader.report_error(u'unable to find Flash URL in webpage ' + url)
2470 mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
2472 uri = mMovieParams[0][1]
2473 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
2474 self.report_index_download(epTitle)
2476 indexXml = compat_urllib_request.urlopen(indexUrl).read()
2477 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2478 self._downloader.report_error(u'unable to download episode index: ' + compat_str(err))
# One MRSS <item> per episode act/part; each yields its own info dict.
2483 idoc = xml.etree.ElementTree.fromstring(indexXml)
2484 itemEls = idoc.findall('.//item')
2485 for partNum,itemEl in enumerate(itemEls):
2486 mediaId = itemEl.findall('./guid')[0].text
2487 shortMediaId = mediaId.split(':')[-1]
2488 showId = mediaId.split(':')[-2].replace('.com', '')
2489 officialTitle = itemEl.findall('./title')[0].text
2490 officialDate = itemEl.findall('./pubDate')[0].text
2492 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2493 compat_urllib_parse.urlencode({'uri': mediaId}))
2494 configReq = compat_urllib_request.Request(configUrl)
2495 self.report_config_download(epTitle, shortMediaId)
2497 configXml = compat_urllib_request.urlopen(configReq).read()
2498 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2499 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
# turls accumulates (bitrate, rtmp-url) pairs from the mediagen config
# (its initialization/append lines are elided in this excerpt).
2502 cdoc = xml.etree.ElementTree.fromstring(configXml)
2504 for rendition in cdoc.findall('.//rendition'):
2505 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2509 self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
2512 if self._downloader.params.get('listformats', None):
2513 self._print_formats([i[0] for i in turls])
2516 # For now, just pick the highest bitrate
2517 format,rtmp_video_url = turls[-1]
2519 # Get the format arg from the arg stream
2520 req_format = self._downloader.params.get('format', None)
2522 # Select format if we can find one
2525 format, rtmp_video_url = f, v
# rtmpdump-unfriendly rtmp URL is rewritten onto the known HTTP mirror.
2528 m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
2530 raise ExtractorError(u'Cannot transform RTMP url')
2531 base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
2532 video_url = base + m.group('finalid')
2534 effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
# Fields of the per-part info dict (braces elided); one entry appended
# to results per MRSS item.
2539 'upload_date': officialDate,
2544 'description': officialTitle,
2546 results.append(info)
# EscapistIE: extracts escapistmagazine.com videos via the og:video player
# URL, whose 'config=' query parameter points at a JS-flavored JSON playlist;
# playlist[1] holds the actual media URL.
# NOTE(review): excerpt with original line numbers fused in and lines elided
# (guards, `try:` headers, info-dict braces, return) — kept byte-identical.
2551 class EscapistIE(InfoExtractor):
2552 """Information extractor for The Escapist """
2554 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2555 IE_NAME = u'escapist'
2557 def report_extraction(self, showName):
2558 self.to_screen(u'%s: Extracting information' % showName)
2560 def report_config_download(self, showName):
2561 self.to_screen(u'%s: Downloading configuration' % showName)
2563 def _real_extract(self, url):
2564 mobj = re.match(self._VALID_URL, url)
2566 self._downloader.report_error(u'invalid URL: %s' % url)
2568 showName = mobj.group('showname')
2569 videoId = mobj.group('episode')
2571 self.report_extraction(showName)
# Decode the page using the charset advertised in Content-Type,
# defaulting to utf-8 when no charset is present.
2573 webPage = compat_urllib_request.urlopen(url)
2574 webPageBytes = webPage.read()
2575 m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
2576 webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
2577 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2578 self._downloader.report_error(u'unable to download webpage: ' + compat_str(err))
# NOTE(review): these .group(1) calls assume every meta tag matched;
# a missing tag would raise AttributeError — confirm intended.
2581 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2582 description = unescapeHTML(descMatch.group(1))
2583 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2584 imgUrl = unescapeHTML(imgMatch.group(1))
2585 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2586 playerUrl = unescapeHTML(playerUrlMatch.group(1))
2587 configUrlMatch = re.search('config=(.*)$', playerUrl)
2588 configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))
2590 self.report_config_download(showName)
2592 configJSON = compat_urllib_request.urlopen(configUrl)
2593 m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
2594 configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
2595 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2596 self._downloader.report_error(u'unable to download configuration: ' + compat_str(err))
2599 # Technically, it's JavaScript, not JSON
2600 configJSON = configJSON.replace("'", '"')
2603 config = json.loads(configJSON)
2604 except (ValueError,) as err:
2605 self._downloader.report_error(u'Invalid JSON in configuration file: ' + compat_str(err))
2608 playlist = config['playlist']
# playlist[1] is the media entry (presumably [0] is an intro/ad) —
# TODO confirm against a live config.
2609 videoUrl = playlist[1]['url']
# Fields of the returned info dict (braces elided in this excerpt).
2614 'uploader': showName,
2615 'upload_date': None,
2618 'thumbnail': imgUrl,
2619 'description': description,
2620 'player_url': playerUrl,
# CollegeHumorIE: extracts collegehumor.com videos in two XML hops — the
# moogaloop metadata XML (title/description/thumbnail/manifest URL), then the
# Adobe f4m manifest whose media/id nodes are recombined into the final
# HDS-style segment URL.
# NOTE(review): excerpt with original line numbers fused in and lines elided
# (guards, `try:` headers, the info dict's opening brace, return) — kept
# byte-identical.
2625 class CollegeHumorIE(InfoExtractor):
2626 """Information extractor for collegehumor.com"""
2629 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2630 IE_NAME = u'collegehumor'
2632 def report_manifest(self, video_id):
2633 """Report information extraction."""
2634 self.to_screen(u'%s: Downloading XML manifest' % video_id)
2636 def report_extraction(self, video_id):
2637 """Report information extraction."""
2638 self.to_screen(u'%s: Extracting information' % video_id)
2640 def _real_extract(self, url):
2641 mobj = re.match(self._VALID_URL, url)
2643 self._downloader.report_error(u'invalid URL: %s' % url)
2645 video_id = mobj.group('videoid')
# info dict is pre-seeded here (braces elided in this excerpt).
2650 'upload_date': None,
2653 self.report_extraction(video_id)
2654 xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
2656 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2657 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2658 self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
2661 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2663 videoNode = mdoc.findall('./video')[0]
2664 info['description'] = videoNode.findall('./description')[0].text
2665 info['title'] = videoNode.findall('./caption')[0].text
2666 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
2667 manifest_url = videoNode.findall('./file')[0].text
2669 self._downloader.report_error(u'Invalid metadata XML file')
# hdcore query arg is required for the f4m manifest endpoint to respond.
2672 manifest_url += '?hdcore=2.10.3'
2673 self.report_manifest(video_id)
2675 manifestXml = compat_urllib_request.urlopen(manifest_url).read()
2676 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2677 self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
2680 adoc = xml.etree.ElementTree.fromstring(manifestXml)
2682 media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
2683 node_id = media_node.attrib['url']
2684 video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
2685 except IndexError as err:
2686 self._downloader.report_error(u'Invalid manifest file')
# Rebuild the direct segment URL from the manifest host + media/id nodes.
2689 url_pr = compat_urllib_parse_urlparse(manifest_url)
2690 url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
# XVideosIE: extracts xvideos.com videos by scraping the watch page for the
# URL-encoded flv_url parameter, the <title> tag, and the thumbnail URL.
# NOTE(review): excerpt with original line numbers fused in and lines elided
# (the `if mobj is None:` guards, info-dict braces, return) — kept
# byte-identical.
2697 class XVideosIE(InfoExtractor):
2698 """Information extractor for xvideos.com"""
2700 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2701 IE_NAME = u'xvideos'
2703 def report_extraction(self, video_id):
2704 """Report information extraction."""
2705 self.to_screen(u'%s: Extracting information' % video_id)
2707 def _real_extract(self, url):
2708 mobj = re.match(self._VALID_URL, url)
2710 self._downloader.report_error(u'invalid URL: %s' % url)
2712 video_id = mobj.group(1)
2714 webpage = self._download_webpage(url, video_id)
2716 self.report_extraction(video_id)
# The flv URL is embedded percent-encoded in the player flashvars.
2720 mobj = re.search(r'flv_url=(.+?)&', webpage)
2722 self._downloader.report_error(u'unable to extract video url')
2724 video_url = compat_urllib_parse.unquote(mobj.group(1))
2728 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2730 self._downloader.report_error(u'unable to extract video title')
2732 video_title = mobj.group(1)
2735 # Extract video thumbnail
2736 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
2738 self._downloader.report_error(u'unable to extract video thumbnail')
# group(0) = the whole matched thumbnail URL.
2740 video_thumbnail = mobj.group(0)
# Fields of the returned info dict (braces elided in this excerpt).
2746 'upload_date': None,
2747 'title': video_title,
2749 'thumbnail': video_thumbnail,
2750 'description': None,
# SoundcloudIE: extracts a single soundcloud.com track. The /user/slug URL is
# resolved through the public resolve.json API (hard-coded client_id) to get
# the numeric track id, then the i1 streams endpoint supplies the
# http_mp3_128_url media link.
# NOTE(review): excerpt with original line numbers fused in and lines elided
# (the `if mobj is None:` guard, `try:` headers, info-dict braces, return).
2756 class SoundcloudIE(InfoExtractor):
2757 """Information extractor for soundcloud.com
2758 To access the media, the uid of the song and a stream token
2759 must be extracted from the page source and the script must make
2760 a request to media.soundcloud.com/crossdomain.xml. Then
2761 the media can be grabbed by requesting from an url composed
2762 of the stream token and uid
2765 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2766 IE_NAME = u'soundcloud'
2768 def __init__(self, downloader=None):
2769 InfoExtractor.__init__(self, downloader)
2771 def report_resolve(self, video_id):
2772 """Report information extraction."""
2773 self.to_screen(u'%s: Resolving id' % video_id)
2775 def report_extraction(self, video_id):
2776 """Report information extraction."""
2777 self.to_screen(u'%s: Retrieving stream' % video_id)
2779 def _real_extract(self, url):
2780 mobj = re.match(self._VALID_URL, url)
2782 self._downloader.report_error(u'invalid URL: %s' % url)
2785 # extract uploader (which is in the url)
2786 uploader = mobj.group(1)
2787 # extract simple title (uploader + slug of song title)
2788 slug_title = mobj.group(2)
2789 simple_title = uploader + u'-' + slug_title
2791 self.report_resolve('%s/%s' % (uploader, slug_title))
# resolve.json maps the human-readable track URL to API metadata.
2793 url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
2794 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2795 request = compat_urllib_request.Request(resolv_url)
2797 info_json_bytes = compat_urllib_request.urlopen(request).read()
2798 info_json = info_json_bytes.decode('utf-8')
2799 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2800 self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
2803 info = json.loads(info_json)
2804 video_id = info['id']
2805 self.report_extraction('%s/%s' % (uploader, slug_title))
2807 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2808 request = compat_urllib_request.Request(streams_url)
2810 stream_json_bytes = compat_urllib_request.urlopen(request).read()
2811 stream_json = stream_json_bytes.decode('utf-8')
2812 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2813 self._downloader.report_error(u'unable to download stream definitions: %s' % compat_str(err))
2816 streams = json.loads(stream_json)
2817 mediaURL = streams['http_mp3_128_url']
# Fields of the returned info dict (braces elided in this excerpt);
# NOTE(review): upload_date is passed through raw from created_at —
# presumably not the YYYYMMDD format documented on the base class;
# confirm.
2822 'uploader': info['user']['username'],
2823 'upload_date': info['created_at'],
2824 'title': info['title'],
2826 'description': info['description'],
# SoundcloudSetIE: like SoundcloudIE but for /sets/ (playlist) URLs — the
# resolved set's 'tracks' list is iterated and one info dict is produced per
# track via the same i1 streams endpoint.
# NOTE(review): excerpt with original line numbers fused in and lines elided
# (guards, `try:` headers, per-track info-dict braces, return).
# NOTE(review): IE_NAME duplicates SoundcloudIE's u'soundcloud', and this
# class calls the deprecated self._downloader.trouble(u'ERROR: ...') where
# every sibling uses report_error — worth unifying (not changed here).
2829 class SoundcloudSetIE(InfoExtractor):
2830 """Information extractor for soundcloud.com sets
2831 To access the media, the uid of the song and a stream token
2832 must be extracted from the page source and the script must make
2833 a request to media.soundcloud.com/crossdomain.xml. Then
2834 the media can be grabbed by requesting from an url composed
2835 of the stream token and uid
2838 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
2839 IE_NAME = u'soundcloud'
2841 def __init__(self, downloader=None):
2842 InfoExtractor.__init__(self, downloader)
2844 def report_resolve(self, video_id):
2845 """Report information extraction."""
2846 self.to_screen(u'%s: Resolving id' % video_id)
2848 def report_extraction(self, video_id):
2849 """Report information extraction."""
2850 self.to_screen(u'%s: Retrieving stream' % video_id)
2852 def _real_extract(self, url):
2853 mobj = re.match(self._VALID_URL, url)
2855 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2858 # extract uploader (which is in the url)
2859 uploader = mobj.group(1)
2860 # extract simple title (uploader + slug of song title)
2861 slug_title = mobj.group(2)
2862 simple_title = uploader + u'-' + slug_title
2864 self.report_resolve('%s/sets/%s' % (uploader, slug_title))
# resolve.json maps the set URL to API metadata including its tracks.
2866 url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
2867 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2868 request = compat_urllib_request.Request(resolv_url)
2870 info_json_bytes = compat_urllib_request.urlopen(request).read()
2871 info_json = info_json_bytes.decode('utf-8')
2872 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2873 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2877 info = json.loads(info_json)
# API-level errors come back in an 'errors' list; report each.
2878 if 'errors' in info:
2879 for err in info['errors']:
2880 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err['error_message']))
2883 for track in info['tracks']:
2884 video_id = track['id']
2885 self.report_extraction('%s/sets/%s' % (uploader, slug_title))
2887 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2888 request = compat_urllib_request.Request(streams_url)
2890 stream_json_bytes = compat_urllib_request.urlopen(request).read()
2891 stream_json = stream_json_bytes.decode('utf-8')
2892 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2893 self._downloader.trouble(u'ERROR: unable to download stream definitions: %s' % compat_str(err))
2896 streams = json.loads(stream_json)
2897 mediaURL = streams['http_mp3_128_url']
# Per-track info-dict fields (braces and list-append elided in this
# excerpt).
2902 'uploader': track['user']['username'],
2903 'upload_date': track['created_at'],
2904 'title': track['title'],
2906 'description': track['description'],
# InfoQIE: extracts infoq.com presentation videos. The page embeds a
# base64-encoded, URL-quoted media id in jsclassref='...'; decoding it and
# prefixing the rtmpe CDN base yields the stream URL. Title comes from the
# contentTitle JS variable and description from the meta tag.
# NOTE(review): excerpt with original line numbers fused in and lines elided
# (the `if mobj is None:` guards, info-dict braces, return) — kept
# byte-identical.
2911 class InfoQIE(InfoExtractor):
2912 """Information extractor for infoq.com"""
2913 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2915 def report_extraction(self, video_id):
2916 """Report information extraction."""
2917 self.to_screen(u'%s: Extracting information' % video_id)
2919 def _real_extract(self, url):
2920 mobj = re.match(self._VALID_URL, url)
2922 self._downloader.report_error(u'invalid URL: %s' % url)
# The full URL doubles as the video id for progress reporting.
2925 webpage = self._download_webpage(url, video_id=url)
2926 self.report_extraction(url)
2929 mobj = re.search(r"jsclassref='([^']*)'", webpage)
2931 self._downloader.report_error(u'unable to extract video url')
2933 real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
2934 video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id
2937 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2939 self._downloader.report_error(u'unable to extract video title')
2941 video_title = mobj.group(1)
2943 # Extract description
2944 video_description = u'No description available.'
2945 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2946 if mobj is not None:
2947 video_description = mobj.group(1)
# Derive id/extension from the media filename inside the rtmpe URL.
2949 video_filename = video_url.split('/')[-1]
2950 video_id, extension = video_filename.split('.')
# Fields of the returned info dict (braces elided in this excerpt).
2956 'upload_date': None,
2957 'title': video_title,
2958 'ext': extension, # Extension is always(?) mp4, but seems to be flv
2960 'description': video_description,
# MixcloudIE: extracts cloudcast audio from mixcloud.com via the site's
# /api/1/cloudcast JSON endpoint.  Disabled (_WORKING = False) below.
# NOTE(review): this listing has elided lines (embedded numbering skips);
# comments describe only the code that is visible.
2965 class MixcloudIE(InfoExtractor):
2966 """Information extractor for www.mixcloud.com"""
2968 _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
2969 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2970 IE_NAME = u'mixcloud'
# Trivial constructor; defers entirely to the base class.
2972 def __init__(self, downloader=None):
2973 InfoExtractor.__init__(self, downloader)
# Progress-reporting helpers (file_id is unused in the JSON message).
2975 def report_download_json(self, file_id):
2976 """Report JSON download."""
2977 self.to_screen(u'Downloading json')
2979 def report_extraction(self, file_id):
2980 """Report information extraction."""
2981 self.to_screen(u'%s: Extracting information' % file_id)
# Pick the URL list for a format.  When per-bitrate tables exist, 'best'
# (or an unknown bitrate) falls back to the highest available bitrate; a
# TypeError means jsonData[fmt] is a plain list without bitrate keys.
2983 def get_urls(self, jsonData, fmt, bitrate='best'):
2984 """Get urls from 'audio_formats' section in json"""
2987 bitrate_list = jsonData[fmt]
2988 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2989 bitrate = max(bitrate_list) # select highest
2991 url_list = jsonData[fmt][bitrate]
2992 except TypeError: # we have no bitrate info.
2993 url_list = jsonData[fmt]
# Probe candidate URLs with urlopen and return the first reachable one.
2996 def check_urls(self, url_list):
2997 """Returns 1st active url from list"""
2998 for url in url_list:
3000 compat_urllib_request.urlopen(url)
3002 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
# Print a "format<TAB>bitrate<TAB>[ext]" table for --list-formats.
3007 def _print_formats(self, formats):
3008 print('Available formats:')
3009 for fmt in formats.keys():
3010 for b in formats[fmt]:
3012 ext = formats[fmt][b][0]
3013 print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
3014 except TypeError: # we have no bitrate info
3015 ext = formats[fmt][0]
3016 print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
3019 def _real_extract(self, url):
3020 mobj = re.match(self._VALID_URL, url)
3022 self._downloader.report_error(u'invalid URL: %s' % url)
3024 # extract uploader & filename from url
# NOTE(review): .decode('utf-8') on regex groups assumes Python 2 byte
# strings; under Python 3 these are str and would raise AttributeError.
3025 uploader = mobj.group(1).decode('utf-8')
3026 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
3028 # construct API request
3029 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
3030 # retrieve .json file with links to files
3031 request = compat_urllib_request.Request(file_url)
3033 self.report_download_json(file_url)
3034 jsonData = compat_urllib_request.urlopen(request).read()
3035 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3036 self._downloader.report_error(u'Unable to retrieve file: %s' % compat_str(err))
# Parse JSON and pull the player URL plus the per-format URL table.
3040 json_data = json.loads(jsonData)
3041 player_url = json_data['player_swf_url']
3042 formats = dict(json_data['audio_formats'])
3044 req_format = self._downloader.params.get('format', None)
3047 if self._downloader.params.get('listformats', None):
3048 self._print_formats(formats)
# 'best' (or no preference): take the first format whose URL responds.
3051 if req_format is None or req_format == 'best':
3052 for format_param in formats.keys():
3053 url_list = self.get_urls(formats, format_param)
3055 file_url = self.check_urls(url_list)
3056 if file_url is not None:
# Explicit format request: it must exist in the JSON table.
3059 if req_format not in formats:
3060 self._downloader.report_error(u'format is not available')
3063 url_list = self.get_urls(formats, req_format)
3064 file_url = self.check_urls(url_list)
3065 format_param = req_format
# Result dictionary (fields per the InfoExtractor contract in the header).
3068 'id': file_id.decode('utf-8'),
3069 'url': file_url.decode('utf-8'),
3070 'uploader': uploader.decode('utf-8'),
3071 'upload_date': None,
3072 'title': json_data['name'],
3073 'ext': file_url.split('.')[-1].decode('utf-8'),
3074 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
3075 'thumbnail': json_data['thumbnail_url'],
3076 'description': json_data['description'],
3077 'player_url': player_url.decode('utf-8'),
# StanfordOpenClassroomIE: handles three URL shapes — a specific video
# (course + video params, metadata from a per-video XML file), a course page
# (yields a list of VideoPage references), and the site root (yields a list
# of CoursePage references).  Reference lists are re-fed through
# self.extract() so each entry is resolved recursively.
# NOTE(review): this listing has elided lines; comments cover visible code.
3080 class StanfordOpenClassroomIE(InfoExtractor):
3081 """Information extractor for Stanford's Open ClassRoom"""
3083 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
3084 IE_NAME = u'stanfordoc'
3086 def report_download_webpage(self, objid):
3087 """Report information extraction."""
3088 self.to_screen(u'%s: Downloading webpage' % objid)
3090 def report_extraction(self, video_id):
3091 """Report information extraction."""
3092 self.to_screen(u'%s: Extracting information' % video_id)
3094 def _real_extract(self, url):
3095 mobj = re.match(self._VALID_URL, url)
3097 raise ExtractorError(u'Invalid URL: %s' % url)
# Case 1: both course and video named — fetch the per-video XML metadata.
3099 if mobj.group('course') and mobj.group('video'): # A specific video
3100 course = mobj.group('course')
3101 video = mobj.group('video')
3103 'id': course + '_' + video,
3105 'upload_date': None,
3108 self.report_extraction(info['id'])
3109 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
3110 xmlUrl = baseUrl + video + '.xml'
3112 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
3113 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3114 self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
3116 mdoc = xml.etree.ElementTree.fromstring(metaXml)
# title/videoFile are required elements of the metadata XML.
3118 info['title'] = mdoc.findall('./title')[0].text
3119 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
3121 self._downloader.report_error(u'Invalid metadata XML file')
3123 info['ext'] = info['url'].rpartition('.')[2]
# Case 2: only a course — scrape the course page for its video links.
3125 elif mobj.group('course'): # A course page
3126 course = mobj.group('course')
3131 'upload_date': None,
3134 coursepage = self._download_webpage(url, info['id'],
3135 note='Downloading course info page',
3136 errnote='Unable to download course info page')
3138 m = re.search('<h1>([^<]+)</h1>', coursepage)
3140 info['title'] = unescapeHTML(m.group(1))
# Fall back to the course id when no <h1> title is present.
3142 info['title'] = info['id']
3144 m = re.search('<description>([^<]+)</description>', coursepage)
3146 info['description'] = unescapeHTML(m.group(1))
3148 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
3151 'type': 'reference',
3152 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
# Resolve each collected reference through the normal extraction pipeline.
3156 for entry in info['list']:
3157 assert entry['type'] == 'reference'
3158 results += self.extract(entry['url'])
# Case 3: site root — enumerate all course pages.
3162 'id': 'Stanford OpenClassroom',
3165 'upload_date': None,
3168 self.report_download_webpage(info['id'])
3169 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
3171 rootpage = compat_urllib_request.urlopen(rootURL).read()
3172 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3173 self._downloader.report_error(u'unable to download course info page: ' + compat_str(err))
3176 info['title'] = info['id']
3178 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
3181 'type': 'reference',
3182 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
3187 for entry in info['list']:
3188 assert entry['type'] == 'reference'
3189 results += self.extract(entry['url'])
# MTVIE: scrapes mtv.com video pages — song/performer come from meta tags,
# then a mediaGen XML playlist is fetched and the last <rendition> (highest
# quality, per the inline comment) supplies the stream URL and format.
# NOTE(review): this listing has elided lines; comments cover visible code.
3192 class MTVIE(InfoExtractor):
3193 """Information extractor for MTV.com"""
3195 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
3198 def report_extraction(self, video_id):
3199 """Report information extraction."""
3200 self.to_screen(u'%s: Extracting information' % video_id)
3202 def _real_extract(self, url):
3203 mobj = re.match(self._VALID_URL, url)
3205 self._downloader.report_error(u'invalid URL: %s' % url)
# The scheme is optional in _VALID_URL; default to plain http.
3207 if not mobj.group('proto'):
3208 url = 'http://' + url
3209 video_id = mobj.group('videoid')
3211 webpage = self._download_webpage(url, video_id)
3213 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
3215 self._downloader.report_error(u'unable to extract song name')
# NOTE(review): .decode('iso-8859-1') assumes Python 2 byte strings; on
# Python 3 the group is already str and this would fail.
3217 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3218 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
3220 self._downloader.report_error(u'unable to extract performer')
3222 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3223 video_title = performer + ' - ' + song_name
3225 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
# NOTE(review): message reads 'unable to mtvn_uri' — missing the word
# 'extract' (pre-existing wording; left untouched in this doc pass).
3227 self._downloader.report_error(u'unable to mtvn_uri')
3229 mtvn_uri = mobj.group(1)
3231 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
3233 self._downloader.report_error(u'unable to extract content id')
3235 content_id = mobj.group(1)
# Build the mediaGen playlist request from the scraped identifiers.
3237 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
3238 self.report_extraction(video_id)
3239 request = compat_urllib_request.Request(videogen_url)
3241 metadataXml = compat_urllib_request.urlopen(request).read()
3242 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3243 self._downloader.report_error(u'unable to download video metadata: %s' % compat_str(err))
3246 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
3247 renditions = mdoc.findall('.//rendition')
3249 # For now, always pick the highest quality.
3250 rendition = renditions[-1]
# Format string: "<ext>-<width>x<height>_<bitrate>" from rendition attrs.
3253 _,_,ext = rendition.attrib['type'].partition('/')
3254 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
3255 video_url = rendition.find('./src').text
3257 self._downloader.trouble('Invalid rendition field.')
3263 'uploader': performer,
3264 'upload_date': None,
3265 'title': video_title,
# YoukuIE: extracts multi-segment flv downloads from v.youku.com.  The
# getPlayList JSON supplies a 'seed' and obfuscated stream file ids; the
# _get_file_* helpers de-obfuscate them with a simple LCG-driven shuffle,
# and one info dict is produced per segment key.
# NOTE(review): this listing has elided lines; comments cover visible code.
3273 class YoukuIE(InfoExtractor):
3274 _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
3276 def report_download_webpage(self, file_id):
3277 """Report webpage download."""
3278 self.to_screen(u'%s: Downloading webpage' % file_id)
3280 def report_extraction(self, file_id):
3281 """Report information extraction."""
3282 self.to_screen(u'%s: Extracting information' % file_id)
# Session id: millisecond timestamp concatenated with two random numbers.
3285 nowTime = int(time.time() * 1000)
3286 random1 = random.randint(1000,1998)
3287 random2 = random.randint(1000,9999)
3289 return "%d%d%d" %(nowTime,random1,random2)
# Shuffle a fixed alphabet deterministically from 'seed' (a linear
# congruential generator picks and removes one character per step).
3291 def _get_file_ID_mix_string(self, seed):
3293 source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
3295 for i in range(len(source)):
3296 seed = (seed * 211 + 30031 ) % 65536
3297 index = math.floor(seed / 65536 * len(source) )
3298 mixed.append(source[int(index)])
3299 source.remove(source[int(index)])
3300 #return ''.join(mixed)
# Decode the '*'-separated fileId: each token indexes the mixed alphabet.
3303 def _get_file_id(self, fileId, seed):
3304 mixed = self._get_file_ID_mix_string(seed)
3305 ids = fileId.split('*')
3309 realId.append(mixed[int(ch)])
3310 return ''.join(realId)
3312 def _real_extract(self, url):
3313 mobj = re.match(self._VALID_URL, url)
3315 self._downloader.report_error(u'invalid URL: %s' % url)
3317 video_id = mobj.group('ID')
3319 info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
3321 request = compat_urllib_request.Request(info_url, None, std_headers)
3323 self.report_download_webpage(video_id)
3324 jsondata = compat_urllib_request.urlopen(request).read()
3325 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3326 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
3329 self.report_extraction(video_id)
3331 jsonstr = jsondata.decode('utf-8')
3332 config = json.loads(jsonstr)
3334 video_title = config['data'][0]['title']
3335 seed = config['data'][0]['seed']
# Format selection against the formats the JSON actually advertises.
3337 format = self._downloader.params.get('format', None)
3338 supported_format = list(config['data'][0]['streamfileids'].keys())
3340 if format is None or format == 'best':
3341 if 'hd2' in supported_format:
3346 elif format == 'worst':
3354 fileid = config['data'][0]['streamfileids'][format]
3355 keys = [s['k'] for s in config['data'][0]['segs'][format]]
3356 except (UnicodeDecodeError, ValueError, KeyError):
3357 self._downloader.report_error(u'unable to extract info section')
3361 sid = self._gen_sid()
3362 fileid = self._get_file_id(fileid, seed)
3364 #column 8,9 of fileid represent the segment number
3365 #fileid[7:9] should be changed
# One download URL (and info dict) per segment; the segment index is
# patched into the de-obfuscated fileid as a two-digit hex field.
3366 for index, key in enumerate(keys):
3368 temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
3369 download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
3372 'id': '%s_part%02d' % (video_id, index),
3373 'url': download_url,
3375 'upload_date': None,
3376 'title': video_title,
3379 files_info.append(info)
# XNXXIE: scrapes video.xnxx.com pages — the flv URL, title, and thumbnail
# are pulled out of the page with the three class-level regexes below.
# NOTE(review): this listing has elided lines; comments cover visible code.
3384 class XNXXIE(InfoExtractor):
3385 """Information extractor for xnxx.com"""
3387 _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
3389 VIDEO_URL_RE = r'flv_url=(.*?)&'
3390 VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
3391 VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'
3393 def report_webpage(self, video_id):
3394 """Report information extraction"""
3395 self.to_screen(u'%s: Downloading webpage' % video_id)
3397 def report_extraction(self, video_id):
3398 """Report information extraction"""
3399 self.to_screen(u'%s: Extracting information' % video_id)
3401 def _real_extract(self, url):
3402 mobj = re.match(self._VALID_URL, url)
3404 self._downloader.report_error(u'invalid URL: %s' % url)
3406 video_id = mobj.group(1)
3408 self.report_webpage(video_id)
3410 # Get webpage content
3412 webpage_bytes = compat_urllib_request.urlopen(url).read()
3413 webpage = webpage_bytes.decode('utf-8')
3414 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3415 self._downloader.report_error(u'unable to download video webpage: %s' % err)
# The flv URL is percent-encoded inside the page, hence the unquote.
3418 result = re.search(self.VIDEO_URL_RE, webpage)
3420 self._downloader.report_error(u'unable to extract video url')
3422 video_url = compat_urllib_parse.unquote(result.group(1))
3424 result = re.search(self.VIDEO_TITLE_RE, webpage)
3426 self._downloader.report_error(u'unable to extract video title')
3428 video_title = result.group(1)
3430 result = re.search(self.VIDEO_THUMB_RE, webpage)
3432 self._downloader.report_error(u'unable to extract video thumbnail')
3434 video_thumbnail = result.group(1)
3440 'upload_date': None,
3441 'title': video_title,
3443 'thumbnail': video_thumbnail,
3444 'description': None,
# GooglePlusIE: extracts videos embedded in Google+ posts.  Two-step flow:
# (1) scrape the post page for date/uploader/title and the photo-viewer
# link, (2) fetch that viewer page and pick the highest-resolution
# redirector.googlevideo.com link.
# NOTE(review): this listing has elided lines; comments cover visible code.
3448 class GooglePlusIE(InfoExtractor):
3449 """Information extractor for plus.google.com."""
3451 _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
3452 IE_NAME = u'plus.google'
# Trivial constructor; defers entirely to the base class.
3454 def __init__(self, downloader=None):
3455 InfoExtractor.__init__(self, downloader)
# Progress-reporting helpers for each scraped field.
3457 def report_extract_entry(self, url):
3458 """Report downloading extry"""
3459 self.to_screen(u'Downloading entry: %s' % url)
3461 def report_date(self, upload_date):
3462 """Report downloading extry"""
3463 self.to_screen(u'Entry date: %s' % upload_date)
3465 def report_uploader(self, uploader):
3466 """Report downloading extry"""
3467 self.to_screen(u'Uploader: %s' % uploader)
3469 def report_title(self, video_title):
3470 """Report downloading extry"""
3471 self.to_screen(u'Title: %s' % video_title)
3473 def report_extract_vid_page(self, video_page):
3474 """Report information extraction."""
3475 self.to_screen(u'Extracting video page: %s' % video_page)
3477 def _real_extract(self, url):
3478 # Extract id from URL
3479 mobj = re.match(self._VALID_URL, url)
3481 self._downloader.report_error(u'Invalid URL: %s' % url)
3484 post_url = mobj.group(0)
3485 video_id = mobj.group(1)
3487 video_extension = 'flv'
3489 # Step 1, Retrieve post webpage to extract further information
3490 self.report_extract_entry(post_url)
3491 request = compat_urllib_request.Request(post_url)
3493 webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3494 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3495 self._downloader.report_error(u'Unable to retrieve entry webpage: %s' % compat_str(err))
3498 # Extract update date
3500 pattern = 'title="Timestamp">(.*?)</a>'
3501 mobj = re.search(pattern, webpage)
3503 upload_date = mobj.group(1)
3504 # Convert timestring to a format suitable for filename
3505 upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3506 upload_date = upload_date.strftime('%Y%m%d')
3507 self.report_date(upload_date)
# Uploader name comes from the author link on the post.
3511 pattern = r'rel\="author".*?>(.*?)</a>'
3512 mobj = re.search(pattern, webpage)
3514 uploader = mobj.group(1)
3515 self.report_uploader(uploader)
3518 # Get the first line for title
3520 pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3521 mobj = re.search(pattern, webpage)
3523 video_title = mobj.group(1)
3524 self.report_title(video_title)
3526 # Step 2, Stimulate clicking the image box to launch video
3527 pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3528 mobj = re.search(pattern, webpage)
3530 self._downloader.report_error(u'unable to extract video page URL')
3532 video_page = mobj.group(1)
3533 request = compat_urllib_request.Request(video_page)
3535 webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3536 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3537 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
3539 self.report_extract_vid_page(video_page)
3542 # Extract video links on video page
3543 """Extract video links of all sizes"""
3544 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3545 mobj = re.findall(pattern, webpage)
3547 self._downloader.report_error(u'unable to extract video links')
3549 # Sort in resolution
3550 links = sorted(mobj)
3552 # Choose the lowest of the sort, i.e. highest resolution
3553 video_url = links[-1]
3554 # Only get the url. The resolution part in the tuple has no use anymore
3555 video_url = video_url[-1]
3556 # Treat escaped \u0026 style hex
# Python 2 str has .decode; on Python 3 fall back to bytes round-trip.
3558 video_url = video_url.decode("unicode_escape")
3559 except AttributeError: # Python 3
3560 video_url = bytes(video_url, 'ascii').decode('unicode-escape')
3566 'uploader': uploader,
3567 'upload_date': upload_date,
3568 'title': video_title,
3569 'ext': video_extension,
class NBAIE(InfoExtractor):
    """Information extractor for NBA.com video pages.

    The media file lives on Turner's CDN at a path derived directly from
    the page URL; title, date and description are scraped from meta tags.
    Returns a single-element list of info dictionaries.
    """
    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1)
        # Normalize '/path/index.html' to '/path' so the CDN URL is correct.
        if video_id.endswith('/index.html'):
            video_id = video_id[:-len('/index.html')]

        webpage = self._download_webpage(url, video_id)

        # Predictable CDN location; no playlist/manifest fetch is needed.
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'

        def _findProp(rexp, default=None):
            # Return the first regex group found in the page, or *default*.
            m = re.search(rexp, webpage)
            if m:
                return unescapeHTML(m.group(1))
            else:
                return default

        shortened_video_id = video_id.rpartition('/')[2]
        title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
        info = {
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            # BUG FIX: this key used to be 'uploader_date', which is not a
            # field the downloader knows; the documented name is
            # 'upload_date' (see the field list in the module docstring).
            'upload_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
            'description': _findProp(r'<div class="description">(.*?)</h1>'),
        }
        return [info]
# JustinTVIE: extracts clips from justin.tv / twitch.tv via the api.justin.tv
# JSON API, paging through results _JUSTIN_PAGE_LIMIT at a time (channel
# archives are paged; a single broadcast lookup is not).
# NOTE(review): this listing has elided lines; comments cover visible code.
3608 class JustinTVIE(InfoExtractor):
3609 """Information extractor for justin.tv and twitch.tv"""
3610 # TODO: One broadcast may be split into multiple videos. The key
3611 # 'broadcast_id' is the same for all parts, and 'broadcast_part'
3612 # starts at 1 and increases. Can we treat all parts as one video?
3614 _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
3615 ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
3616 _JUSTIN_PAGE_LIMIT = 100
3617 IE_NAME = u'justin.tv'
3619 def report_extraction(self, file_id):
3620 """Report information extraction."""
3621 self.to_screen(u'%s: Extracting information' % file_id)
3623 def report_download_page(self, channel, offset):
3624 """Report attempt to download a single page of videos."""
3625 self.to_screen(u'%s: Downloading video information from %d to %d' %
3626 (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
3628 # Return count of items, list of *valid* items
3629 def _parse_page(self, url):
3631 urlh = compat_urllib_request.urlopen(url)
3632 webpage_bytes = urlh.read()
3633 webpage = webpage_bytes.decode('utf-8', 'ignore')
3634 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3635 self._downloader.report_error(u'unable to download video info JSON: %s' % compat_str(err))
# A non-list response is the API's error envelope.
3638 response = json.loads(webpage)
3639 if type(response) != list:
3640 error_text = response.get('error', 'unknown error')
3641 self._downloader.report_error(u'Justin.tv API: %s' % error_text)
3644 for clip in response:
3645 video_url = clip['video_file_url']
3647 video_extension = os.path.splitext(video_url)[1][1:]
# 'start_time' begins with an ISO date; strip the dashes for YYYYMMDD.
3648 video_date = re.sub('-', '', clip['start_time'][:10])
3649 video_uploader_id = clip.get('user_id', clip.get('channel_id'))
3650 video_id = clip['id']
3651 video_title = clip.get('title', video_id)
3655 'title': video_title,
3656 'uploader': clip.get('channel_name', video_uploader_id),
3657 'uploader_id': video_uploader_id,
3658 'upload_date': video_date,
3659 'ext': video_extension,
# Returns the raw item count alongside the parsed infos so the caller can
# detect the last (short) page.
3661 return (len(response), info)
3663 def _real_extract(self, url):
3664 mobj = re.match(self._VALID_URL, url)
3666 self._downloader.report_error(u'invalid URL: %s' % url)
# Group 1 = channel name, group 2 (if present) = single broadcast id.
3669 api = 'http://api.justin.tv'
3670 video_id = mobj.group(mobj.lastindex)
3672 if mobj.lastindex == 1:
3674 api += '/channel/archives/%s.json'
3676 api += '/broadcast/by_archive/%s.json'
3677 api = api % (video_id,)
3679 self.report_extraction(video_id)
3683 limit = self._JUSTIN_PAGE_LIMIT
3686 self.report_download_page(video_id, offset)
3687 page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
3688 page_count, page_info = self._parse_page(page_url)
3689 info.extend(page_info)
# A short page means we have reached the end of the archive.
3690 if not paged or page_count != limit:
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com video pages.

    The direct video URL is read from the page's <video>/<source> markup;
    the title comes from the player heading, falling back to <title>; the
    description comes from the og:description meta tag when present.
    """
    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
        if not m:
            self._downloader.report_error(u'unable to find video information')
        video_url = unescapeHTML(m.group('url'))

        m = re.search(r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", webpage, flags=re.DOTALL)
        if not m:
            m = re.search(r'<title>(?P<title>[^<]+?)</title>', webpage)
            if not m:
                # BUG FIX: previously used the deprecated
                # self._downloader.trouble(), which did not abort, so
                # execution fell through to m.group('title') with m = None
                # and crashed with AttributeError instead of a clean error.
                raise ExtractorError(u'Cannot find video title')
        title = clean_html(m.group('title'))

        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        if m:
            desc = unescapeHTML(m.group('desc'))
        else:
            desc = None

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
        }
        return [info]
# SteamIE: extracts all trailers on a store.steampowered.com game page.
# _VALID_URL is written with (?x)-style whitespace, so suitable() is
# overridden to pass re.VERBOSE.  Videos, titles and thumbnails are matched
# as three parallel iterators zipped together.
# NOTE(review): this listing has elided lines; comments cover visible code.
3734 class SteamIE(InfoExtractor):
3735 _VALID_URL = r"""http://store.steampowered.com/
3736 (?P<urltype>video|app)/ #If the page is only for videos or for a game
3738 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
3742 def suitable(cls, url):
3743 """Receives a URL and returns True if suitable for this IE."""
3744 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
3746 def _real_extract(self, url):
3747 m = re.match(self._VALID_URL, url, re.VERBOSE)
# The movie entries live in a JS object literal on the /video/ page.
3748 urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
3749 gameID = m.group('gameID')
3750 videourl = 'http://store.steampowered.com/video/%s/' % gameID
3751 webpage = self._download_webpage(videourl, gameID)
3752 mweb = re.finditer(urlRE, webpage)
3753 namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
3754 titles = re.finditer(namesRE, webpage)
3755 thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
3756 thumbs = re.finditer(thumbsRE, webpage)
# NOTE(review): zip() silently assumes the three match streams stay in
# lockstep; a page-layout change could pair titles with the wrong video.
3758 for vid,vtitle,thumb in zip(mweb,titles,thumbs):
3759 video_id = vid.group('videoID')
3760 title = vtitle.group('videoName')
3761 video_url = vid.group('videoURL')
3762 video_thumb = thumb.group('thumbnail')
3764 self._downloader.report_error(u'Cannot find video url for %s' % video_id)
3769 'title': unescapeHTML(title),
3770 'thumbnail': video_thumb
class UstreamIE(InfoExtractor):
    """Information extractor for ustream.tv recorded videos.

    The flv of a recording is served from a predictable tcdn.ustream.tv
    path; the title and the uploader (channel) id are scraped from the
    recording's HTML page.
    """
    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # BUG FIX: both re.search() results were previously dereferenced
        # without a None check, so any page-layout change surfaced as a raw
        # AttributeError instead of a meaningful extraction error.
        m = re.search(r'data-title="(?P<title>.+)"', webpage)
        if m is None:
            raise ExtractorError(u'Unable to extract video title')
        title = m.group('title')

        m = re.search(r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"', webpage)
        if m is None:
            raise ExtractorError(u'Unable to extract uploader')
        uploader = m.group('uploader')

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'flv',
            'title': title,
            'uploader': uploader,
        }
        return [info]
# WorldStarHipHopIE: scrapes worldstarhiphop.com (and the 'candy' mirror)
# pages for a direct mp4/flv link, with a title fallback chain ending in a
# timestamped placeholder.
# NOTE(review): this listing has elided lines; comments cover visible code.
3797 class WorldStarHipHopIE(InfoExtractor):
3798 _VALID_URL = r'http://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
3799 IE_NAME = u'WorldStarHipHop'
3801 def _real_extract(self, url):
# The media URL is matched directly out of the raw page source.
3802 _src_url = r"""(http://hw-videos.*(?:mp4|flv))"""
3804 webpage_src = compat_urllib_request.urlopen(url).read()
3805 webpage_src = webpage_src.decode('utf-8')
3807 mobj = re.search(_src_url, webpage_src)
3809 m = re.match(self._VALID_URL, url)
3810 video_id = m.group('id')
# The container/ext is inferred from the matched URL itself.
3812 if mobj is not None:
3813 video_url = mobj.group()
3814 if 'mp4' in video_url:
# NOTE(review): uses the deprecated self._downloader.trouble() here,
# unlike the report_error/ExtractorError style used elsewhere in the file.
3819 self._downloader.trouble(u'ERROR: Cannot find video url for %s' % video_id)
3822 _title = r"""<title>(.*)</title>"""
3824 mobj = re.search(_title, webpage_src)
3826 if mobj is not None:
3827 title = mobj.group(1)
# Fallback title embeds the current time ('World Start' is in the
# original runtime string and is left untouched by this doc pass).
3829 title = 'World Start Hip Hop - %s' % time.ctime()
3831 _thumbnail = r"""rel="image_src" href="(.*)" />"""
3832 mobj = re.search(_thumbnail, webpage_src)
3834 # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
3835 if mobj is not None:
3836 thumbnail = mobj.group(1)
3838 _title = r"""candytitles.*>(.*)</span>"""
3839 mobj = re.search(_title, webpage_src)
3840 if mobj is not None:
3841 title = mobj.group(1)
3848 'thumbnail' : thumbnail,
# RBMARadioIE: extracts shows from rbmaradio.com.  All metadata comes from
# a JSON blob assigned to window.gon in an inline <script>; the stream URL
# is the 'akamai_url' with a fixed 256kbps cbr parameter appended.
# NOTE(review): this listing has elided lines; comments cover visible code.
3853 class RBMARadioIE(InfoExtractor):
3854 _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'
3856 def _real_extract(self, url):
3857 m = re.match(self._VALID_URL, url)
3858 video_id = m.group('videoID')
3860 webpage = self._download_webpage(url, video_id)
3861 m = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
3863 raise ExtractorError(u'Cannot find metadata')
3864 json_data = m.group(1)
3867 data = json.loads(json_data)
3868 except ValueError as e:
3869 raise ExtractorError(u'Invalid JSON: ' + str(e))
3871 video_url = data['akamai_url'] + '&cbr=256'
# Extension is taken from the path component of the stream URL.
3872 url_parts = compat_urllib_parse_urlparse(video_url)
3873 video_ext = url_parts.path.rpartition('.')[2]
# Optional fields use dict.get so missing JSON keys become None.
3878 'title': data['title'],
3879 'description': data.get('teaser_text'),
3880 'location': data.get('country_of_origin'),
3881 'uploader': data.get('host', {}).get('name'),
3882 'uploader_id': data.get('host', {}).get('slug'),
3883 'thumbnail': data.get('image', {}).get('large_url_2x'),
3884 'duration': data.get('duration'),
# YouPornIE: scrapes youporn.com watch pages.  It builds one info dict per
# entry in the page's download list (multiple resolutions/bitrates) and
# then applies the user's --format selection to that list.
# NOTE(review): this listing has elided lines; comments cover visible code.
3889 class YouPornIE(InfoExtractor):
3890 """Information extractor for youporn.com."""
3891 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
# Table printer for --list-formats.
3893 def _print_formats(self, formats):
3894 """Print all available formats"""
3895 print(u'Available formats:')
3896 print(u'ext\t\tformat')
3897 print(u'---------------------------------')
3898 for format in formats:
3899 print(u'%s\t\t%s' % (format['ext'], format['format']))
# Select the single entry whose 'format' field equals req_format.
3901 def _specific(self, req_format, formats):
3903 if(x["format"]==req_format):
3907 def _real_extract(self, url):
3908 mobj = re.match(self._VALID_URL, url)
3910 self._downloader.report_error(u'invalid URL: %s' % url)
3913 video_id = mobj.group('videoid')
# The site gates content behind an age check; pre-set the cookie.
3915 req = compat_urllib_request.Request(url)
3916 req.add_header('Cookie', 'age_verified=1')
3917 webpage = self._download_webpage(req, video_id)
3919 # Get the video title
3920 result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
3922 raise ExtractorError(u'Unable to extract video title')
3923 video_title = result.group('title').strip()
3925 # Get the video date
3926 result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
3928 self._downloader.report_warning(u'unable to extract video date')
3931 upload_date = result.group('date').strip()
3933 # Get the video uploader
3934 result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
3936 self._downloader.report_warning(u'unable to extract uploader')
3937 video_uploader = None
3939 video_uploader = result.group('uploader').strip()
3940 video_uploader = clean_html( video_uploader )
3942 # Get all of the formats available
3943 DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
3944 result = re.search(DOWNLOAD_LIST_RE, webpage)
3946 raise ExtractorError(u'Unable to extract download list')
3947 download_list_html = result.group('download_list').strip()
3949 # Get all of the links from the page
3950 LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
3951 links = re.findall(LINK_RE, download_list_html)
3952 if(len(links) == 0):
3953 raise ExtractorError(u'ERROR: no known formats available for video')
3955 self.to_screen(u'Links found: %d' % len(links))
# Each link encodes resolution/bitrate in its 5th path component; see the
# worked example kept in the original comments below.
3960 # A link looks like this:
3961 # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
3962 # A path looks like this:
3963 # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
3964 video_url = unescapeHTML( link )
3965 path = compat_urllib_parse_urlparse( video_url ).path
3966 extension = os.path.splitext( path )[1][1:]
3967 format = path.split('/')[4].split('_')[:2]
3970 format = "-".join( format )
3971 title = u'%s-%s-%s' % (video_title, size, bitrate)
3976 'uploader': video_uploader,
3977 'upload_date': upload_date,
3982 'description': None,
3986 if self._downloader.params.get('listformats', None):
3987 self._print_formats(formats)
# Format selection: formats appear best-first, so 'best' is the head and
# 'worst' the tail of the list; '-1'/'all' returns everything.
3990 req_format = self._downloader.params.get('format', None)
3991 self.to_screen(u'Format: %s' % req_format)
3993 if req_format is None or req_format == 'best':
3995 elif req_format == 'worst':
3996 return [formats[-1]]
3997 elif req_format in ('-1', 'all'):
4000 format = self._specific( req_format, formats )
4002 self._downloader.report_error(u'requested format not available')
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com.

    The title comes straight from the URL (named group in _VALID_URL); the
    flv URL and the upload date are scraped out of the watch page.
    """
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')
        video_title = mobj.group('title')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video URL (flv link embedded in the player config)
        VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
        result = re.search(VIDEO_URL_RE, webpage)
        if result is None:
            self._downloader.report_error(u'unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(result.group('url'))

        # Get the uploaded date
        VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
        result = re.search(VIDEO_UPLOADED_RE, webpage)
        if result is None:
            # BUG FIX: the failure message here used to read 'unable to
            # extract video title' although this step extracts the upload
            # date, which made failures misleading to diagnose.
            self._downloader.report_error(u'unable to extract video upload date')
            return
        upload_date = result.group('date')

        info = {'id': video_id,
                'url': video_url,
                'uploader': None,
                'upload_date': upload_date,
                'title': video_title,
                'ext': 'flv',
                'format': 'flv'}

        return [info]
# YouJizzIE: two-step scrape of youjizz.com — the watch page yields the
# title and a link to an embed page, and the embed page's flash config
# contains the actual media URL.
# NOTE(review): this listing has elided lines; comments cover visible code.
4050 class YouJizzIE(InfoExtractor):
4051 """Information extractor for youjizz.com."""
4052 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'
4054 def _real_extract(self, url):
4055 mobj = re.match(self._VALID_URL, url)
4057 self._downloader.report_error(u'invalid URL: %s' % url)
4060 video_id = mobj.group('videoid')
4062 # Get webpage content
4063 webpage = self._download_webpage(url, video_id)
4065 # Get the video title
4066 result = re.search(r'<title>(?P<title>.*)</title>', webpage)
4068 raise ExtractorError(u'ERROR: unable to extract video title')
4069 video_title = result.group('title').strip()
4071 # Get the embed page
4072 result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
4074 raise ExtractorError(u'ERROR: unable to extract embed page')
# The numeric embed id replaces the slug-based id from the watch URL.
4076 embed_page_url = result.group(0).strip()
4077 video_id = result.group('videoid')
4079 webpage = self._download_webpage(embed_page_url, video_id)
# Media URL lives in the embed page's SWF variable assignment.
4082 result = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
4084 raise ExtractorError(u'ERROR: unable to extract video url')
4085 video_url = result.group('source')
4087 info = {'id': video_id,
4089 'title': video_title,
4092 'player_url': embed_page_url}
# EightTracksIE: extracts every track of an 8tracks.com mix.  The mix
# metadata comes from a PAGE.mix JSON blob in the page; tracks are then
# walked one by one through the sets/.../play and sets/.../next API using a
# random session id, until 'at_last_track' is set.
# NOTE(review): this listing has elided lines; comments cover visible code.
4096 class EightTracksIE(InfoExtractor):
4098 _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'
4100 def _real_extract(self, url):
4101 mobj = re.match(self._VALID_URL, url)
4103 raise ExtractorError(u'Invalid URL: %s' % url)
4104 playlist_id = mobj.group('id')
4106 webpage = self._download_webpage(url, playlist_id)
4108 m = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
4110 raise ExtractorError(u'Cannot find trax information')
4111 json_like = m.group(1)
4112 data = json.loads(json_like)
# The play API needs a client-chosen session id.
4114 session = str(random.randint(0, 1000000000))
4116 track_count = data['tracks_count']
4117 first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
4118 next_url = first_url
# Fetch tracks one at a time; each response names the following track.
4120 for i in itertools.count():
4121 api_json = self._download_webpage(next_url, playlist_id,
4122 note=u'Downloading song information %s/%s' % (str(i+1), track_count),
4123 errnote=u'Failed to download song information')
4124 api_data = json.loads(api_json)
4125 track_data = api_data[u'set']['track']
4127 'id': track_data['id'],
4128 'url': track_data['track_file_stream_url'],
4129 'title': track_data['performer'] + u' - ' + track_data['name'],
4130 'raw_title': track_data['name'],
4131 'uploader_id': data['user']['login'],
# Stop once the API flags the final track of the mix.
4135 if api_data['set']['at_last_track']:
4137 next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
class KeekIE(InfoExtractor):
    """Information extractor for keek.com short videos."""
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        """Build the info dictionary for the keek at *url*.

        Video and thumbnail URLs are derived directly from the video id
        (CDN layout); title and uploader are scraped from the page.
        """
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')
        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
        webpage = self._download_webpage(url, video_id)
        m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
        title = unescapeHTML(m.group('title'))
        m = re.search(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>', webpage)
        uploader = clean_html(m.group('uploader'))
        info = {
                'id': video_id,
                'url': video_url,
                'ext': 'mp4',
                'title': title,
                'thumbnail': thumbnail,
                'uploader': uploader
        }
        return [info]
class TEDIE(InfoExtractor):
    """Information extractor for ted.com talks and playlists."""
    # NOTE: written for re.VERBOSE — whitespace and # comments are ignored,
    # hence the overridden suitable() below.
    _VALID_URL=r'''http://www.ted.com/
                   (
                        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                        |
                        ((?P<type_talk>talks)) # We have a simple talk
                   )
                   /(?P<name>\w+) # Here goes the name and then ".html"
                   '''

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Must pass re.VERBOSE, which the base-class suitable() does not.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        """Dispatch to single-talk or playlist extraction based on the URL."""
        m=re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type_talk'):
            return [self._talk_info(url)]
        else :
            playlist_id=m.group('playlist_id')
            name=m.group('name')
            self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
            return [self._playlist_videos_info(url,name,playlist_id)]

    def _talk_video_link(self,mediaSlug):
        '''Returns the video link for that mediaSlug'''
        return 'http://download.ted.com/talks/%s.mp4' % mediaSlug

    def _playlist_videos_info(self,url,name,playlist_id=0):
        '''Returns the videos of the playlist'''
        video_RE=r'''
                     <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
                     ([.\s]*?)data-playlist_item_id="(\d+)"
                     ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
                     '''
        video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
        m_names=re.finditer(video_name_RE,webpage)

        playlist_RE = r'div class="headline">(\s*?)<h1>(\s*?)<span>(?P<playlist_title>.*?)</span>'
        m_playlist = re.search(playlist_RE, webpage)
        playlist_title = m_playlist.group('playlist_title')

        # Each entry is delegated back to this extractor via a url_result,
        # so the per-talk logic lives in one place (_talk_info).
        playlist_entries = []
        for m_video, m_name in zip(m_videos,m_names):
            video_id=m_video.group('video_id')
            talk_url='http://www.ted.com%s' % m_name.group('talk_url')
            playlist_entries.append(self.url_result(talk_url, 'TED'))
        return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        m=re.match(self._VALID_URL, url,re.VERBOSE)
        videoName=m.group('name')
        webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
        # If the url includes the language we get the title translated
        title_RE=r'<span id="altHeadline" >(?P<title>.*)</span>'
        title=re.search(title_RE, webpage).group('title')
        info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
                        "id":(?P<videoID>[\d]+).*?
                        "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
        thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
        thumb_match=re.search(thumb_RE,webpage)
        info_match=re.search(info_RE,webpage,re.VERBOSE)
        video_id=info_match.group('videoID')
        mediaSlug=info_match.group('mediaSlug')
        video_url=self._talk_video_link(mediaSlug)
        info = {
                'id': video_id,
                'url': video_url,
                'ext': 'mp4',
                'title': title,
                'thumbnail': thumb_match.group('thumbnail')
        }
        return info
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de, driven by its XML metadata API."""
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        """Fetch the metadata XML for the video at *url* and build its info dict."""
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            self._downloader.report_error(u'unable to extract download url')
            return
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            self._downloader.report_error(u'unable to extract title')
            return
        title = title_el.text
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            # element is optional; fall back to a sane default
            video_format = 'mp4'
        else:
            video_format = format_id_el.text
        description_el = metadata.find('description')
        if description_el is not None:
            description = description_el.text
        else:
            description = None
        imagePreview_el = metadata.find('imagePreview')
        if imagePreview_el is not None:
            thumbnail = imagePreview_el.text
        else:
            thumbnail = None
        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': video_format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
class SpiegelIE(InfoExtractor):
    """Information extractor for spiegel.de videos (media list served as XML)."""
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        """Scrape the title from the page and the stream info from the XML feed."""
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')

        webpage = self._download_webpage(url, video_id)

        m = re.search(r'<div class="spVideoTitle">(.*?)</div>', webpage)
        if not m:
            raise ExtractorError(u'Cannot find title')
        video_title = unescapeHTML(m.group(1))

        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(xml_url, video_id,
                    note=u'Downloading XML', errnote=u'Failed to download XML')

        idoc = xml.etree.ElementTree.fromstring(xml_code)
        # Variants are listed in ascending quality; take the last one.
        last_type = idoc[-1]
        filename = last_type.findall('./filename')[0].text
        duration = float(last_type.findall('./duration')[0].text)

        video_url = 'http://video2.spiegel.de/flash/' + filename
        video_ext = filename.rpartition('.')[2]
        info = {
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': video_title,
            'duration': duration,
        }
        return [info]
class LiveLeakIE(InfoExtractor):
    """Information extractor for liveleak.com."""

    _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        """Extract url/title (required) and description/uploader (optional)."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        m = re.search(r'file: "(.*?)",', webpage)
        if not m:
            self._downloader.report_error(u'unable to find video url')
            return
        video_url = m.group(1)

        m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
        if not m:
            self._downloader.trouble(u'Cannot find video title')
        title = unescapeHTML(m.group('title')).replace('LiveLeak.com -', '').strip()

        # Description and uploader are best-effort: missing ones become None
        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        if m:
            desc = unescapeHTML(m.group('desc'))
        else:
            desc = None

        m = re.search(r'By:.*?(\w+)</a>', webpage)
        if m:
            uploader = clean_html(m.group(1))
        else:
            uploader = None

        info = {
            'id':  video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
            'uploader': uploader
        }

        return [info]
class ARDIE(InfoExtractor):
    """Information extractor for the ARD Mediathek (German public broadcaster)."""
    _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
    _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
    _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'

    def _real_extract(self, url):
        # determine video id from url
        m = re.match(self._VALID_URL, url)

        # prefer the numeric documentId query parameter when present
        numid = re.search(r'documentId=([0-9]+)', url)
        if numid:
            video_id = numid.group(1)
        else:
            video_id = m.group('video_id')

        # determine title and media streams from webpage
        html = self._download_webpage(url, video_id)
        title = re.search(self._TITLE, html).group('title')
        streams = [mo.groupdict() for mo in re.finditer(self._MEDIA_STREAM, html)]
        if not streams:
            # no streams usually means an age-restricted ("fsk") video,
            # which is only served in the evening
            assert '"fsk"' in html
            self._downloader.report_error(u'this video is only available after 8:00 pm')
            return

        # choose default media type and highest quality for now
        stream = max([s for s in streams if int(s["media_type"]) == 0],
                     key=lambda s: int(s["quality"]))

        # there's two possibilities: RTMP stream or HTTP download
        info = {'id': video_id, 'title': title, 'ext': 'mp4'}
        if stream['rtmp_url']:
            self.to_screen(u'RTMP download detected')
            assert stream['video_url'].startswith('mp4:')
            info["url"] = stream["rtmp_url"]
            info["play_path"] = stream['video_url']
        else:
            assert stream["video_url"].endswith('.mp4')
            info["url"] = stream["video_url"]
        return [info]
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    """
    # GenericIE must stay last: it is the catch-all fallback.
    return [
        YoutubePlaylistIE(),
        YoutubeChannelIE(),
        YoutubeUserIE(),
        YoutubeSearchIE(),
        YoutubeIE(),
        MetacafeIE(),
        DailymotionIE(),
        GoogleSearchIE(),
        PhotobucketIE(),
        YahooIE(),
        YahooSearchIE(),
        DepositFilesIE(),
        FacebookIE(),
        BlipTVUserIE(),
        BlipTVIE(),
        VimeoIE(),
        MyVideoIE(),
        ComedyCentralIE(),
        EscapistIE(),
        CollegeHumorIE(),
        XVideosIE(),
        SoundcloudSetIE(),
        SoundcloudIE(),
        InfoQIE(),
        MixcloudIE(),
        StanfordOpenClassroomIE(),
        MTVIE(),
        YoukuIE(),
        XNXXIE(),
        YouJizzIE(),
        PornotubeIE(),
        YouPornIE(),
        GooglePlusIE(),
        ArteTvIE(),
        NBAIE(),
        WorldStarHipHopIE(),
        JustinTVIE(),
        FunnyOrDieIE(),
        SteamIE(),
        UstreamIE(),
        RBMARadioIE(),
        EightTracksIE(),
        KeekIE(),
        TEDIE(),
        MySpassIE(),
        SpiegelIE(),
        LiveLeakIE(),
        ARDIE(),
        GenericIE()
    ]
def get_info_extractor(ie_name):
    """Returns the info extractor class with the given ie_name"""
    class_name = '%sIE' % ie_name
    return globals()[class_name]