2 # -*- coding: utf-8 -*-
4 from __future__ import absolute_import
15 import xml.etree.ElementTree
23 class InfoExtractor(object):
24 """Information Extractor class.
26 Information extractors are the classes that, given a URL, extract
27 information about the video (or videos) the URL refers to. This
28 information includes the real video URL, the video title, author and
29 others. The information is stored in a dictionary which is then
30 passed to the FileDownloader. The FileDownloader processes this
31 information possibly downloading the video to the file system, among
32 other possible outcomes.
34 The dictionaries must include the following fields:
38 title: Video title, unescaped.
39 ext: Video filename extension.
41 The following fields are optional:
43 format: The video format, defaults to ext (used for --get-format)
44 thumbnail: Full URL to a video thumbnail image.
45 description: One-line video description.
46 uploader: Full name of the video uploader.
47 upload_date: Video upload date (YYYYMMDD).
48 uploader_id: Nickname or id of the video uploader.
49 location: Physical location of the video.
50 player_url: SWF Player URL (used for rtmpdump).
51 subtitles: The subtitle file contents.
52 urlhandle: [internal] The urlHandle to be used to download the file,
53 like returned by urllib.request.urlopen
55 The fields should all be Unicode strings.
57 Subclasses of this one should re-define the _real_initialize() and
58 _real_extract() methods and define a _VALID_URL regexp.
59 Probably, they should also be added to the list of extractors.
61 _real_extract() must return a *list* of information dictionaries as
64 Finally, the _WORKING attribute should be set to False for broken IEs
65 in order to warn the users and skip the tests.
72 def __init__(self, downloader=None):
73 """Constructor. Receives an optional downloader."""
75 self.set_downloader(downloader)
78 def suitable(cls, url):
79 """Receives a URL and returns True if suitable for this IE."""
80 return re.match(cls._VALID_URL, url) is not None
84 """Getter method for _WORKING."""
88 """Initializes an instance (authentication, etc)."""
90 self._real_initialize()
93 def extract(self, url):
94 """Extracts URL information and returns it in list of dicts."""
96 return self._real_extract(url)
98 def set_downloader(self, downloader):
99 """Sets the downloader for this IE."""
100 self._downloader = downloader
102 def _real_initialize(self):
103 """Real initialization process. Redefine in subclasses."""
106 def _real_extract(self, url):
107 """Real extraction process. Redefine in subclasses."""
112 return type(self).__name__[:-2]
114 def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
115 """ Returns the response handle """
117 note = u'Downloading video webpage'
118 if note is not False:
119 self.to_screen(u'%s: %s' % (video_id, note))
121 return compat_urllib_request.urlopen(url_or_request)
122 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
124 errnote = u'Unable to download webpage'
125 raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])
127 def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
128 """ Returns the data of the page as a string """
129 urlh = self._request_webpage(url_or_request, video_id, note, errnote)
130 content_type = urlh.headers.get('Content-Type', '')
131 m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
133 encoding = m.group(1)
136 webpage_bytes = urlh.read()
137 if self._downloader.params.get('dump_intermediate_pages', False):
139 url = url_or_request.get_full_url()
140 except AttributeError:
142 self.to_screen(u'Dumping request to ' + url)
143 dump = base64.b64encode(webpage_bytes).decode('ascii')
144 self._downloader.to_screen(dump)
145 return webpage_bytes.decode(encoding, 'replace')
147 def to_screen(self, msg):
148 """Print msg to screen, prefixing it with '[ie_name]'"""
149 self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))
151 def report_extraction(self, id_or_name):
152 """Report information extraction."""
153 self.to_screen(u'%s: Extracting information' % id_or_name)
155 def report_age_confirmation(self):
156 """Report attempt to confirm age."""
157 self.to_screen(u'Confirming age')
159 #Methods for following #608
160 #They set the correct value of the '_type' key
161 def video_result(self, video_info):
162 """Returns a video"""
163 video_info['_type'] = 'video'
165 def url_result(self, url, ie=None):
166 """Returns a url that points to a page that should be processed"""
167 #TODO: ie should be the class used for getting the info
168 video_info = {'_type': 'url',
172 def playlist_result(self, entries, playlist_id=None, playlist_title=None):
173 """Returns a playlist"""
174 video_info = {'_type': 'playlist',
177 video_info['id'] = playlist_id
179 video_info['title'] = playlist_title
183 class YoutubeIE(InfoExtractor):
184 """Information extractor for youtube.com."""
188 (?:https?://)? # http(s):// (optional)
189 (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
190 tube\.majestyc\.net/) # the various hostnames, with wildcard subdomains
191 (?:.*?\#/)? # handle anchor (#/) redirect urls
192 (?: # the various things that can precede the ID:
193 (?:(?:v|embed|e)/) # v/ or embed/ or e/
194 |(?: # or the v= param in all its forms
195 (?:watch(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
196 (?:\?|\#!?) # the params delimiter ? or # or #!
197 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
200 )? # optional -> youtube.com/xxxx is OK
201 )? # all until now is optional -> you can pass the naked ID
202 ([0-9A-Za-z_-]+) # here is it! the YouTube video ID
203 (?(1).+)? # if we found the ID, everything can follow
205 _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
206 _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
207 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
208 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
209 _NETRC_MACHINE = 'youtube'
210 # Listed in order of quality
211 _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
212 _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
213 _video_extensions = {
219 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
225 _video_dimensions = {
244 def suitable(cls, url):
245 """Receives a URL and returns True if suitable for this IE."""
246 if YoutubePlaylistIE.suitable(url): return False
247 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
249 def report_lang(self):
250 """Report attempt to set language."""
251 self.to_screen(u'Setting language')
253 def report_login(self):
254 """Report attempt to log in."""
255 self.to_screen(u'Logging in')
257 def report_video_webpage_download(self, video_id):
258 """Report attempt to download video webpage."""
259 self.to_screen(u'%s: Downloading video webpage' % video_id)
261 def report_video_info_webpage_download(self, video_id):
262 """Report attempt to download video info webpage."""
263 self.to_screen(u'%s: Downloading video info webpage' % video_id)
265 def report_video_subtitles_download(self, video_id):
266 """Report attempt to download video info webpage."""
267 self.to_screen(u'%s: Checking available subtitles' % video_id)
269 def report_video_subtitles_request(self, video_id, sub_lang, format):
270 """Report attempt to download video info webpage."""
271 self.to_screen(u'%s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))
273 def report_video_subtitles_available(self, video_id, sub_lang_list):
274 """Report available subtitles."""
275 sub_lang = ",".join(list(sub_lang_list.keys()))
276 self.to_screen(u'%s: Available subtitles for video: %s' % (video_id, sub_lang))
278 def report_information_extraction(self, video_id):
279 """Report attempt to extract video information."""
280 self.to_screen(u'%s: Extracting video information' % video_id)
282 def report_unavailable_format(self, video_id, format):
283 """Report extracted video URL."""
284 self.to_screen(u'%s: Format %s not available' % (video_id, format))
286 def report_rtmp_download(self):
287 """Indicate the download will use the RTMP protocol."""
288 self.to_screen(u'RTMP download detected')
290 def _get_available_subtitles(self, video_id):
291 self.report_video_subtitles_download(video_id)
292 request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
294 sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
295 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
296 return (u'unable to download video subtitles: %s' % compat_str(err), None)
297 sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
298 sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
299 if not sub_lang_list:
300 return (u'video doesn\'t have subtitles', None)
303 def _list_available_subtitles(self, video_id):
304 sub_lang_list = self._get_available_subtitles(video_id)
305 self.report_video_subtitles_available(video_id, sub_lang_list)
307 def _request_subtitle(self, sub_lang, sub_name, video_id, format):
310 (error_message, sub_lang, sub)
312 self.report_video_subtitles_request(video_id, sub_lang, format)
313 params = compat_urllib_parse.urlencode({
319 url = 'http://www.youtube.com/api/timedtext?' + params
321 sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
322 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
323 return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
325 return (u'Did not fetch video subtitles', None, None)
326 return (None, sub_lang, sub)
328 def _extract_subtitle(self, video_id):
330 Return a list with a tuple:
331 [(error_message, sub_lang, sub)]
333 sub_lang_list = self._get_available_subtitles(video_id)
334 sub_format = self._downloader.params.get('subtitlesformat')
335 if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
336 return [(sub_lang_list[0], None, None)]
337 if self._downloader.params.get('subtitleslang', False):
338 sub_lang = self._downloader.params.get('subtitleslang')
339 elif 'en' in sub_lang_list:
342 sub_lang = list(sub_lang_list.keys())[0]
343 if not sub_lang in sub_lang_list:
344 return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]
346 subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
349 def _extract_all_subtitles(self, video_id):
350 sub_lang_list = self._get_available_subtitles(video_id)
351 sub_format = self._downloader.params.get('subtitlesformat')
352 if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
353 return [(sub_lang_list[0], None, None)]
355 for sub_lang in sub_lang_list:
356 subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
357 subtitles.append(subtitle)
360 def _print_formats(self, formats):
361 print('Available formats:')
363 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
365 def _real_initialize(self):
366 if self._downloader is None:
371 downloader_params = self._downloader.params
373 # Attempt to use provided username and password or .netrc data
374 if downloader_params.get('username', None) is not None:
375 username = downloader_params['username']
376 password = downloader_params['password']
377 elif downloader_params.get('usenetrc', False):
379 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
384 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
385 except (IOError, netrc.NetrcParseError) as err:
386 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
390 request = compat_urllib_request.Request(self._LANG_URL)
393 compat_urllib_request.urlopen(request).read()
394 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
395 self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
398 # No authentication to be performed
402 request = compat_urllib_request.Request(self._LOGIN_URL)
404 login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
405 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
406 self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
411 match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
413 galx = match.group(1)
415 match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
421 u'continue': u'http://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
425 u'PersistentCookie': u'yes',
427 u'bgresponse': u'js_disabled',
428 u'checkConnection': u'',
429 u'checkedDomains': u'youtube',
435 u'signIn': u'Sign in',
437 u'service': u'youtube',
441 # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
443 login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
444 login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
445 request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
448 login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
449 if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
450 self._downloader.report_warning(u'unable to log in: bad username or password')
452 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
453 self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
459 'action_confirm': 'Confirm',
461 request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
463 self.report_age_confirmation()
464 age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
465 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
466 self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
469 def _extract_id(self, url):
470 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
472 self._downloader.report_error(u'invalid URL: %s' % url)
474 video_id = mobj.group(2)
477 def _real_extract(self, url):
478 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
479 mobj = re.search(self._NEXT_URL_RE, url)
481 url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
482 video_id = self._extract_id(url)
485 self.report_video_webpage_download(video_id)
486 url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
487 request = compat_urllib_request.Request(url)
489 video_webpage_bytes = compat_urllib_request.urlopen(request).read()
490 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
491 self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
494 video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')
496 # Attempt to extract SWF player URL
497 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
499 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
504 self.report_video_info_webpage_download(video_id)
505 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
506 video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
507 % (video_id, el_type))
508 video_info_webpage = self._download_webpage(video_info_url, video_id,
510 errnote='unable to download video info webpage')
511 video_info = compat_parse_qs(video_info_webpage)
512 if 'token' in video_info:
514 if 'token' not in video_info:
515 if 'reason' in video_info:
516 self._downloader.report_error(u'YouTube said: %s' % video_info['reason'][0])
518 self._downloader.report_error(u'"token" parameter not in video info for unknown reason')
521 # Check for "rental" videos
522 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
523 self._downloader.report_error(u'"rental" videos not supported')
526 # Start extracting information
527 self.report_information_extraction(video_id)
530 if 'author' not in video_info:
531 self._downloader.report_error(u'unable to extract uploader name')
533 video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
536 video_uploader_id = None
537 mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
539 video_uploader_id = mobj.group(1)
541 self._downloader.report_warning(u'unable to extract uploader nickname')
544 if 'title' not in video_info:
545 self._downloader.report_error(u'unable to extract video title')
547 video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
550 if 'thumbnail_url' not in video_info:
551 self._downloader.report_warning(u'unable to extract video thumbnail')
553 else: # don't panic if we can't find it
554 video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
558 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
560 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
561 format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
562 for expression in format_expressions:
564 upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
569 video_description = get_element_by_id("eow-description", video_webpage)
570 if video_description:
571 video_description = clean_html(video_description)
573 video_description = ''
576 video_subtitles = None
578 if self._downloader.params.get('writesubtitles', False):
579 video_subtitles = self._extract_subtitle(video_id)
581 (sub_error, sub_lang, sub) = video_subtitles[0]
583 self._downloader.report_error(sub_error)
585 if self._downloader.params.get('allsubtitles', False):
586 video_subtitles = self._extract_all_subtitles(video_id)
587 for video_subtitle in video_subtitles:
588 (sub_error, sub_lang, sub) = video_subtitle
590 self._downloader.report_error(sub_error)
592 if self._downloader.params.get('listsubtitles', False):
593 sub_lang_list = self._list_available_subtitles(video_id)
596 if 'length_seconds' not in video_info:
597 self._downloader.report_warning(u'unable to extract video duration')
600 video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
603 video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])
605 # Decide which formats to download
606 req_format = self._downloader.params.get('format', None)
608 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
609 self.report_rtmp_download()
610 video_url_list = [(None, video_info['conn'][0])]
611 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
612 url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
613 url_data = [compat_parse_qs(uds) for uds in url_data_strs]
614 url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
615 url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)
617 format_limit = self._downloader.params.get('format_limit', None)
618 available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
619 if format_limit is not None and format_limit in available_formats:
620 format_list = available_formats[available_formats.index(format_limit):]
622 format_list = available_formats
623 existing_formats = [x for x in format_list if x in url_map]
624 if len(existing_formats) == 0:
625 raise ExtractorError(u'no known formats available for video')
626 if self._downloader.params.get('listformats', None):
627 self._print_formats(existing_formats)
629 if req_format is None or req_format == 'best':
630 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
631 elif req_format == 'worst':
632 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
633 elif req_format in ('-1', 'all'):
634 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
636 # Specific formats. We pick the first in a slash-delimeted sequence.
637 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
638 req_formats = req_format.split('/')
639 video_url_list = None
640 for rf in req_formats:
642 video_url_list = [(rf, url_map[rf])]
644 if video_url_list is None:
645 raise ExtractorError(u'requested format not available')
647 raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')
650 for format_param, video_real_url in video_url_list:
652 video_extension = self._video_extensions.get(format_param, 'flv')
654 video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
655 self._video_dimensions.get(format_param, '???'))
659 'url': video_real_url,
660 'uploader': video_uploader,
661 'uploader_id': video_uploader_id,
662 'upload_date': upload_date,
663 'title': video_title,
664 'ext': video_extension,
665 'format': video_format,
666 'thumbnail': video_thumbnail,
667 'description': video_description,
668 'player_url': player_url,
669 'subtitles': video_subtitles,
670 'duration': video_duration
675 class MetacafeIE(InfoExtractor):
676 """Information Extractor for metacafe.com."""
678 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
679 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
680 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
681 IE_NAME = u'metacafe'
683 def __init__(self, downloader=None):
684 InfoExtractor.__init__(self, downloader)
686 def report_disclaimer(self):
687 """Report disclaimer retrieval."""
688 self.to_screen(u'Retrieving disclaimer')
690 def report_download_webpage(self, video_id):
691 """Report webpage download."""
692 self.to_screen(u'%s: Downloading webpage' % video_id)
694 def _real_initialize(self):
695 # Retrieve disclaimer
696 request = compat_urllib_request.Request(self._DISCLAIMER)
698 self.report_disclaimer()
699 disclaimer = compat_urllib_request.urlopen(request).read()
700 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
701 self._downloader.report_error(u'unable to retrieve disclaimer: %s' % compat_str(err))
707 'submit': "Continue - I'm over 18",
709 request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
711 self.report_age_confirmation()
712 disclaimer = compat_urllib_request.urlopen(request).read()
713 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
714 self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
717 def _real_extract(self, url):
718 # Extract id and simplified title from URL
719 mobj = re.match(self._VALID_URL, url)
721 self._downloader.report_error(u'invalid URL: %s' % url)
724 video_id = mobj.group(1)
726 # Check if video comes from YouTube
727 mobj2 = re.match(r'^yt-(.*)$', video_id)
728 if mobj2 is not None:
729 return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1), 'Youtube')]
731 # Retrieve video webpage to extract further information
732 webpage = self._download_webpage('http://www.metacafe.com/watch/%s/' % video_id, video_id)
734 # Extract URL, uploader and title from webpage
735 self.report_extraction(video_id)
736 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
738 mediaURL = compat_urllib_parse.unquote(mobj.group(1))
739 video_extension = mediaURL[-3:]
741 # Extract gdaKey if available
742 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
746 gdaKey = mobj.group(1)
747 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
749 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
751 self._downloader.report_error(u'unable to extract media URL')
753 vardict = compat_parse_qs(mobj.group(1))
754 if 'mediaData' not in vardict:
755 self._downloader.report_error(u'unable to extract media URL')
757 mobj = re.search(r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
759 self._downloader.report_error(u'unable to extract media URL')
761 mediaURL = mobj.group('mediaURL').replace('\\/', '/')
762 video_extension = mediaURL[-3:]
763 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key'))
765 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
767 self._downloader.report_error(u'unable to extract title')
769 video_title = mobj.group(1).decode('utf-8')
771 mobj = re.search(r'submitter=(.*?);', webpage)
773 self._downloader.report_error(u'unable to extract uploader nickname')
775 video_uploader = mobj.group(1)
778 'id': video_id.decode('utf-8'),
779 'url': video_url.decode('utf-8'),
780 'uploader': video_uploader.decode('utf-8'),
782 'title': video_title,
783 'ext': video_extension.decode('utf-8'),
787 class DailymotionIE(InfoExtractor):
788 """Information Extractor for Dailymotion"""
790 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
791 IE_NAME = u'dailymotion'
794 def __init__(self, downloader=None):
795 InfoExtractor.__init__(self, downloader)
797 def _real_extract(self, url):
798 # Extract id and simplified title from URL
799 mobj = re.match(self._VALID_URL, url)
801 self._downloader.report_error(u'invalid URL: %s' % url)
804 video_id = mobj.group(1).split('_')[0].split('?')[0]
806 video_extension = 'mp4'
808 # Retrieve video webpage to extract further information
809 request = compat_urllib_request.Request(url)
810 request.add_header('Cookie', 'family_filter=off')
811 webpage = self._download_webpage(request, video_id)
813 # Extract URL, uploader and title from webpage
814 self.report_extraction(video_id)
815 mobj = re.search(r'\s*var flashvars = (.*)', webpage)
817 self._downloader.report_error(u'unable to extract media URL')
819 flashvars = compat_urllib_parse.unquote(mobj.group(1))
821 for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
824 self.to_screen(u'Using %s' % key)
827 self._downloader.report_error(u'unable to extract video URL')
830 mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
832 self._downloader.report_error(u'unable to extract video URL')
835 video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')
837 # TODO: support choosing qualities
839 mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
841 self._downloader.report_error(u'unable to extract title')
843 video_title = unescapeHTML(mobj.group('title'))
845 video_uploader = None
846 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
848 # lookin for official user
849 mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
850 if mobj_official is None:
851 self._downloader.report_warning(u'unable to extract uploader nickname')
853 video_uploader = mobj_official.group(1)
855 video_uploader = mobj.group(1)
857 video_upload_date = None
858 mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
860 video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)
865 'uploader': video_uploader,
866 'upload_date': video_upload_date,
867 'title': video_title,
868 'ext': video_extension,
872 class PhotobucketIE(InfoExtractor):
873 """Information extractor for photobucket.com."""
875 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
876 IE_NAME = u'photobucket'
878 def __init__(self, downloader=None):
879 InfoExtractor.__init__(self, downloader)
881 def report_download_webpage(self, video_id):
882 """Report webpage download."""
883 self.to_screen(u'%s: Downloading webpage' % video_id)
885 def _real_extract(self, url):
886 # Extract id from URL
887 mobj = re.match(self._VALID_URL, url)
889 self._downloader.report_error(u'Invalid URL: %s' % url)
892 video_id = mobj.group(1)
894 video_extension = 'flv'
896 # Retrieve video webpage to extract further information
897 request = compat_urllib_request.Request(url)
899 self.report_download_webpage(video_id)
900 webpage = compat_urllib_request.urlopen(request).read()
901 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
902 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
905 # Extract URL, uploader, and title from webpage
906 self.report_extraction(video_id)
907 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
909 self._downloader.report_error(u'unable to extract media URL')
911 mediaURL = compat_urllib_parse.unquote(mobj.group(1))
915 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
917 self._downloader.report_error(u'unable to extract title')
919 video_title = mobj.group(1).decode('utf-8')
921 video_uploader = mobj.group(2).decode('utf-8')
924 'id': video_id.decode('utf-8'),
925 'url': video_url.decode('utf-8'),
926 'uploader': video_uploader,
928 'title': video_title,
929 'ext': video_extension.decode('utf-8'),
933 class YahooIE(InfoExtractor):
934 """Information extractor for video.yahoo.com."""
937 # _VALID_URL matches all Yahoo! Video URLs
938 # _VPAGE_URL matches only the extractable '/watch/' URLs
939 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
940 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
941 IE_NAME = u'video.yahoo'
943 def __init__(self, downloader=None):
944 InfoExtractor.__init__(self, downloader)
946 def report_download_webpage(self, video_id):
947 """Report webpage download."""
948 self.to_screen(u'%s: Downloading webpage' % video_id)
950 def _real_extract(self, url, new_video=True):
951 # Extract ID from URL
952 mobj = re.match(self._VALID_URL, url)
954 self._downloader.report_error(u'Invalid URL: %s' % url)
957 video_id = mobj.group(2)
958 video_extension = 'flv'
960 # Rewrite valid but non-extractable URLs as
961 # extractable English language /watch/ URLs
962 if re.match(self._VPAGE_URL, url) is None:
963 request = compat_urllib_request.Request(url)
965 webpage = compat_urllib_request.urlopen(request).read()
966 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
967 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
970 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
972 self._downloader.report_error(u'Unable to extract id field')
974 yahoo_id = mobj.group(1)
976 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
978 self._downloader.report_error(u'Unable to extract vid field')
980 yahoo_vid = mobj.group(1)
982 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
983 return self._real_extract(url, new_video=False)
985 # Retrieve video webpage to extract further information
986 request = compat_urllib_request.Request(url)
988 self.report_download_webpage(video_id)
989 webpage = compat_urllib_request.urlopen(request).read()
990 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
991 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
994 # Extract uploader and title from webpage
995 self.report_extraction(video_id)
996 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
998 self._downloader.report_error(u'unable to extract video title')
1000 video_title = mobj.group(1).decode('utf-8')
1002 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
1004 self._downloader.report_error(u'unable to extract video uploader')
1006 video_uploader = mobj.group(1).decode('utf-8')
1008 # Extract video thumbnail
1009 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
1011 self._downloader.report_error(u'unable to extract video thumbnail')
1013 video_thumbnail = mobj.group(1).decode('utf-8')
1015 # Extract video description
1016 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
1018 self._downloader.report_error(u'unable to extract video description')
1020 video_description = mobj.group(1).decode('utf-8')
1021 if not video_description:
1022 video_description = 'No description available.'
1024 # Extract video height and width
1025 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
1027 self._downloader.report_error(u'unable to extract video height')
1029 yv_video_height = mobj.group(1)
1031 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
1033 self._downloader.report_error(u'unable to extract video width')
1035 yv_video_width = mobj.group(1)
1037 # Retrieve video playlist to extract media URL
1038 # I'm not completely sure what all these options are, but we
1039 # seem to need most of them, otherwise the server sends a 401.
1040 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
1041 yv_bitrate = '700' # according to Wikipedia this is hard-coded
1042 request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
1043 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
1044 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
1046 self.report_download_webpage(video_id)
1047 webpage = compat_urllib_request.urlopen(request).read()
1048 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1049 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
1052 # Extract media URL from playlist XML
1053 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
1055 self._downloader.report_error(u'Unable to extract media URL')
1057 video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
1058 video_url = unescapeHTML(video_url)
1061 'id': video_id.decode('utf-8'),
1063 'uploader': video_uploader,
1064 'upload_date': None,
1065 'title': video_title,
1066 'ext': video_extension.decode('utf-8'),
1067 'thumbnail': video_thumbnail.decode('utf-8'),
1068 'description': video_description,
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
    IE_NAME = u'vimeo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen(u'%s: Downloading webpage' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract metadata and a direct media URL for a Vimeo page."""
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        if not mobj.group('proto'):
            url = 'https://' + url
        if mobj.group('direct_link'):
            # player/redirect links are normalized to the canonical page
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON embedded in the page.
        # BUGFIX: was guarded by a bare `except:`; only the slicing and the
        # JSON parse can fail here, so catch exactly those.
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except (IndexError, ValueError):
            self._downloader.report_error(u'unable to extract info section')
            return

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description: video_description = clean_html(video_description)
        else: video_description = u''

        # Extract upload date
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self.to_screen(u'%s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            self._downloader.report_error(u'no known codec found')
            return

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    %(video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
        }]
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen(u'%s: Downloading webpage' % video_id)

    def fetch_webpage(self, url):
        """Download *url* and return the raw page body."""
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch *url*, apply *regex* and map groups to keys per *matchTuples*."""
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                # CONSISTENCY FIX: was the deprecated trouble(); every other
                # failure path in this file uses report_error().
                self._downloader.report_error(err)
                return
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        """Locate the rtmp path/player for a live stream page.

        NOTE(review): the computed video_url is not returned, so callers
        cannot actually download live streams — preserved as-is.
        """
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        # raw strings below: the originals relied on '\.'/'\'' surviving as
        # literal backslashes in non-raw strings
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + r'.*?)\'.*?' +
                r'(http://.*?\.swf).*?' +
                r'(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path',   u'ERROR: could not extract video path: %s' % url),
                (2, 'player', u'ERROR: could not extract video player: %s' % url),
                (3, 'url',    u'ERROR: could not extract video url: %s' % url)
            ]
        )
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Follow the videoref XML chain and return the info dict."""
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                r'<name>(.*?)</name>.*?' +
                r'<dateVideo>(.*?)</dateVideo>.*?' +
                r'<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id',    u'ERROR: could not extract video id: %s' % url),
                (2, 'title', u'ERROR: could not extract video title: %s' % url),
                (3, 'date',  u'ERROR: could not extract video date: %s' % url),
                (4, 'url',   u'ERROR: could not extract video url: %s' % url)
            ]
        )

        return {
            'id':           info.get('id'),
            'url':          compat_urllib_parse.unquote(info.get('url')),
            'uploader':     u'arte.tv',
            'upload_date':  info.get('date'),
            'title':        info.get('title').decode('utf-8'),
            'ext':          u'mp4',
            'format':       u'NA',
            'player_url':   None,
        }

    def _real_extract(self, url):
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor."""

    _VALID_URL = r'.*'
    IE_NAME = u'generic'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        if not self._downloader.params.get('test', False):
            self._downloader.report_warning(u'Falling back on generic information extractor.')
        self.to_screen(u'%s: Downloading webpage' % video_id)

    def report_following_redirect(self, new_url):
        """Report information extraction."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case return the new url."""
        class HeadRequest(compat_urllib_request.Request):
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                fp.read()
                fp.close()

                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                                      headers=newheaders,
                                                                      origin_req_host=req.get_origin_req_host(),
                                                                      unverifiable=True))

        # Build our opener from scratch so only the handlers above run.
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        if url == new_url:
            return False

        self.report_following_redirect(new_url)
        return new_url

    def _real_extract(self, url):
        new_url = self._test_redirect(url)
        if new_url: return [self.url_result(new_url)]

        video_id = url.split('/')[-1]
        try:
            webpage = self._download_webpage(url, video_id)
        except ValueError as err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit
            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit: JWPlayer JS loader
            mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
        if mobj is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)
            return

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return
        video_title = mobj.group(1)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        if mobj is None:
            # BUGFIX: this branch previously reported 'unable to extract
            # title', copy-pasted from the title branch above.
            self._downloader.report_error(u'unable to extract uploader')
            return
        video_uploader = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension,
        }]
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries."""
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]  # strip the literal 'ytsearch'
        query = query.encode('utf-8')
        if prefix == '':
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            # BUGFIX: the result list was computed but dropped; return it
            # like the other branches do.
            return self._get_n_results(query, self._max_youtube_results)
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_youtube_results:
                    self._downloader.report_warning(u'ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                    n = self._max_youtube_results
                return self._get_n_results(query, n)
            except ValueError: # parsing prefix as integer fails
                return self._get_n_results(query, 1)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        limit = n

        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download API page: %s' % compat_str(err))
                return
            api_response = json.loads(data)['data']

            # CONSISTENCY FIX: was the deprecated trouble(); also use the
            # idiomatic 'not in' test.
            if 'items' not in api_response:
                self._downloader.report_error(u'[youtube] No video results')
                return

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids]
        return videos
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries."""
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self.to_screen(u'query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]  # strip the literal 'gvsearch'
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_google_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_google_results:
                    self._downloader.report_warning(u'gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
                    n = self._max_google_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in video_ids:
                    video_ids.append(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        # (renamed loop variable: `id` shadowed the builtin)
                        for vid in video_ids:
                            self._downloader.download(['http://video.google.com/videoplay?docid=%s' % vid])
                        return

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for vid in video_ids:
                    self._downloader.download(['http://video.google.com/videoplay?docid=%s' % vid])
                return

            pagenum = pagenum + 1
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""

    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self.to_screen(u'query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.report_error(u'invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]  # strip the literal 'yvsearch'
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_yahoo_results:
                    self._downloader.report_warning(u'yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
                    n = self._max_yahoo_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        already_seen = set()
        pagenum = 1

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in already_seen:
                    video_ids.append(video_id)
                    already_seen.add(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        # (renamed loop variable: `id` shadowed the builtin)
                        for vid in video_ids:
                            self._downloader.download(['http://video.yahoo.com/watch/%s' % vid])
                        return

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for vid in video_ids:
                    self._downloader.download(['http://video.yahoo.com/watch/%s' % vid])
                return

            pagenum = pagenum + 1
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    _VALID_URL = r"""(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
                        .*
                     |
                        ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
    _MAX_RESULTS = 50
    IE_NAME = u'youtube:playlist'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written with re.VERBOSE, so the default suitable()
        # from InfoExtractor would mis-handle the embedded whitespace.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        # Download playlist videos from API
        playlist_id = mobj.group(1) or mobj.group(2)
        page_num = 1
        videos = []

        while True:
            self.report_download_page(playlist_id, page_num)

            url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
            try:
                page = compat_urllib_request.urlopen(url).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            try:
                response = json.loads(page)
            except ValueError as err:
                self._downloader.report_error(u'Invalid JSON in API response: ' + compat_str(err))
                return

            if 'feed' not in response:
                self._downloader.report_error(u'Got a malformed response from YouTube API')
                return
            # BUGFIX: read the title *before* the empty-feed break below;
            # otherwise a playlist whose first page has no entries leaves
            # playlist_title unbound at the playlist_result call.
            playlist_title = response['feed']['title']['$t']
            if 'entry' not in response['feed']:
                # Number of videos is a multiple of self._MAX_RESULTS
                break

            videos += [ (entry['yt$position']['$t'], entry['content']['src'])
                        for entry in response['feed']['entry']
                        if 'content' in entry ]

            if len(response['feed']['entry']) < self._MAX_RESULTS:
                break
            page_num += 1

        # entries arrive keyed by playlist position; restore playlist order
        videos = [v[1] for v in sorted(videos)]

        url_results = [self.url_result(url, 'Youtube') for url in videos]
        return [self.playlist_result(url_results, playlist_id, playlist_title)]
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'

    def report_download_page(self, channel_id, pagenum):
        """Report attempt to download channel page with given number."""
        self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))

    def extract_videos_from_page(self, page):
        """Return the video ids found in *page*, first-seen order, deduped."""
        # PERF: membership test against a set instead of the result list
        # (the list scan was O(n^2) over large channels).
        ids_in_page = []
        seen = set()
        for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            if mobj.group(1) not in seen:
                seen.add(mobj.group(1))
                ids_in_page.append(mobj.group(1))
        return ids_in_page

    def _real_extract(self, url):
        # Extract channel id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        # Download channel page
        channel_id = mobj.group(1)
        video_ids = []
        pagenum = 1

        self.report_download_page(channel_id, pagenum)
        url = self._TEMPLATE_URL % (channel_id, pagenum)
        request = compat_urllib_request.Request(url)
        try:
            page = compat_urllib_request.urlopen(request).read().decode('utf8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
            return

        # Extract video identifiers
        ids_in_page = self.extract_videos_from_page(page)
        video_ids.extend(ids_in_page)

        # Download any subsequent channel pages using the json-based channel_ajax query
        if self._MORE_PAGES_INDICATOR in page:
            while True:
                pagenum = pagenum + 1

                self.report_download_page(channel_id, pagenum)
                url = self._MORE_PAGES_URL % (pagenum, channel_id)
                request = compat_urllib_request.Request(url)
                try:
                    page = compat_urllib_request.urlopen(request).read().decode('utf8')
                except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                    self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                    return

                page = json.loads(page)

                ids_in_page = self.extract_videos_from_page(page['content_html'])
                video_ids.extend(ids_in_page)

                if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
                    break

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        urls = ['http://www.youtube.com/watch?v=%s' % vid for vid in video_ids]
        url_entries = [self.url_result(url, 'Youtube') for url in urls]
        return [self.playlist_result(url_entries, channel_id)]
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 0

        while True:
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            self.report_download_page(username, start_index)

            request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers.
            # PERF: set-based membership instead of scanning the list
            # (was O(n^2) per page).
            ids_in_page = []
            seen = set()

            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                if mobj.group(1) not in seen:
                    seen.add(mobj.group(1))
                    ids_in_page.append(mobj.group(1))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._GDATA_PAGE_SIZE:
                break

            pagenum += 1

        urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
        url_results = [self.url_result(url, 'Youtube') for url in urls]
        return [self.playlist_result(url_results, playlist_title = username)]
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self.to_screen(u'user %s: Downloading video ids from page %d' %
                (username, pagenum))

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.report_error(u'invalid url: %s' % url)
            return

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        request = compat_urllib_request.Request(url)

        try:
            page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            mobj = re.search(r'data-users-id="([^"]+)"', page)
            page_base = page_base % mobj.group(1)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
            return

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(username, pagenum)
            url = page_base + "&page=" + str(pagenum)
            request = compat_urllib_request.Request( url )
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                # BUGFIX: was str(err) — every other handler in this file
                # uses compat_str(), which is also unicode-safe on Python 2.
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                # BUGFIX: the membership test compared the *raw* match against
                # a list of *unescaped* entries, so ids containing HTML
                # entities were never deduplicated. Unescape first.
                video_id = unescapeHTML(mobj.group(1))
                if video_id not in ids_in_page:
                    ids_in_page.append(video_id)

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
        url_entries = [self.url_result(url, 'BlipTV') for url in urls]
        return [self.playlist_result(url_entries, playlist_title = username)]
2014 class DepositFilesIE(InfoExtractor):
2015 """Information extractor for depositfiles.com"""
2017 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
2019 def report_download_webpage(self, file_id):
2020 """Report webpage download."""
2021 self.to_screen(u'%s: Downloading webpage' % file_id)
2023 def _real_extract(self, url):
# The file id is the last path segment of the incoming URL.
2024 file_id = url.split('/')[-1]
2025 # Rebuild url in english locale
2026 url = 'http://depositfiles.com/en/files/' + file_id
2028 # Retrieve file webpage with 'Free download' button pressed
# POSTing gateway_result=1 simulates the "Free download" button press.
2029 free_download_indication = { 'gateway_result' : '1' }
2030 request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
2032 self.report_download_webpage(file_id)
2033 webpage = compat_urllib_request.urlopen(request).read()
2034 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2035 self._downloader.report_error(u'Unable to retrieve file webpage: %s' % compat_str(err))
2038 # Search for the real file URL
2039 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2040 if (mobj is None) or (mobj.group(1) is None):
2041 # Try to figure out reason of the error.
# The site shows a <strong>Attention...</strong> banner when the
# download is restricted (e.g. rate limited); surface it verbatim.
2042 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2043 if (mobj is not None) and (mobj.group(1) is not None):
2044 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2045 self._downloader.report_error(u'%s' % restriction_message)
2047 self._downloader.report_error(u'unable to extract download URL from: %s' % url)
2050 file_url = mobj.group(1)
2051 file_extension = os.path.splitext(file_url)[1][1:]
2053 # Search for file title
2054 mobj = re.search(r'<b title="(.*?)">', webpage)
2056 self._downloader.report_error(u'unable to extract title')
# NOTE(review): .decode('utf-8') on these values is a Python-2 str
# idiom (webpage is read as bytes above); under Python 3 str has no
# .decode -- confirm which runtime this module targets.
2058 file_title = mobj.group(1).decode('utf-8')
2061 'id': file_id.decode('utf-8'),
2062 'url': file_url.decode('utf-8'),
2064 'upload_date': None,
2065 'title': file_title,
2066 'ext': file_extension.decode('utf-8'),
2070 class FacebookIE(InfoExtractor):
2071 """Information Extractor for Facebook"""
2073 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2074 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2075 _NETRC_MACHINE = 'facebook'
2076 IE_NAME = u'facebook'
2078 def report_login(self):
2079 """Report attempt to log in."""
2080 self.to_screen(u'Logging in')
2082 def _real_initialize(self):
# Login is optional: without a downloader (and thus without
# credentials) initialization is a no-op.
2083 if self._downloader is None:
2088 downloader_params = self._downloader.params
2090 # Attempt to use provided username and password or .netrc data
2091 if downloader_params.get('username', None) is not None:
2092 useremail = downloader_params['username']
2093 password = downloader_params['password']
2094 elif downloader_params.get('usenetrc', False):
2096 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2097 if info is not None:
2101 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2102 except (IOError, netrc.NetrcParseError) as err:
# Bad .netrc is only a warning: extraction of public videos can
# still proceed without logging in.
2103 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
2106 if useremail is None:
2115 request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
2118 login_results = compat_urllib_request.urlopen(request).read()
# If the response still contains the login form, authentication failed.
2119 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2120 self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2122 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2123 self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
2126 def _real_extract(self, url):
2127 mobj = re.match(self._VALID_URL, url)
2129 self._downloader.report_error(u'invalid URL: %s' % url)
2131 video_id = mobj.group('ID')
2133 url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
2134 webpage = self._download_webpage(url, video_id)
# The player parameters are embedded as JSON between these two exact
# JavaScript fragments in the page source.
2136 BEFORE = '{swf.addParam(param[0], param[1]);});\n'
2137 AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
2138 m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
2140 raise ExtractorError(u'Cannot parse data')
2141 data = dict(json.loads(m.group(1)))
2142 params_raw = compat_urllib_parse.unquote(data['params'])
2143 params = json.loads(params_raw)
2144 video_data = params['video_data'][0]
# Prefer the HD stream, fall back to SD; neither present is fatal.
2145 video_url = video_data.get('hd_src')
2147 video_url = video_data['sd_src']
2149 raise ExtractorError(u'Cannot find video URL')
2150 video_duration = int(video_data['video_duration'])
2151 thumbnail = video_data['thumbnail_src']
2153 m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
2155 raise ExtractorError(u'Cannot find title in webpage')
2156 video_title = unescapeHTML(m.group(1))
2160 'title': video_title,
2163 'duration': video_duration,
2164 'thumbnail': thumbnail,
2169 class BlipTVIE(InfoExtractor):
2170 """Information extractor for blip.tv"""
2172 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
2173 _URL_EXT = r'^.*\.([a-z0-9]+)$'
2174 IE_NAME = u'blip.tv'
2176 def report_direct_download(self, title):
2177 """Report information extraction."""
2178 self.to_screen(u'%s: Direct download detected' % title)
2180 def _real_extract(self, url):
2181 mobj = re.match(self._VALID_URL, url)
2183 self._downloader.report_error(u'invalid URL: %s' % url)
# /play/ URLs redirect to a player whose fragment carries the real file
# id; resolve it and recurse with the canonical /a/a-<id> URL.
2186 urlp = compat_urllib_parse_urlparse(url)
2187 if urlp.path.startswith('/play/'):
2188 request = compat_urllib_request.Request(url)
2189 response = compat_urllib_request.urlopen(request)
2190 redirecturl = response.geturl()
2191 rurlp = compat_urllib_parse_urlparse(redirecturl)
2192 file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
2193 url = 'http://blip.tv/a/a-' + file_id
2194 return self._real_extract(url)
# Ask for the JSON skin; the iTunes User-Agent is required by the API.
2201 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2202 request = compat_urllib_request.Request(json_url)
2203 request.add_header('User-Agent', 'iTunes/10.6.1')
2204 self.report_extraction(mobj.group(1))
2207 urlh = compat_urllib_request.urlopen(request)
2208 if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
# The URL pointed straight at a media file: derive id/title/ext
# from the basename instead of parsing JSON metadata.
2209 basename = url.split('/')[-1]
2210 title,ext = os.path.splitext(basename)
2211 title = title.decode('UTF-8')
2212 ext = ext.replace('.', '')
2213 self.report_direct_download(title)
2218 'upload_date': None,
2223 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2224 raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
2225 if info is None: # Regular URL
2227 json_code_bytes = urlh.read()
2228 json_code = json_code_bytes.decode('utf-8')
2229 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2230 self._downloader.report_error(u'unable to read video info webpage: %s' % compat_str(err))
2234 json_data = json.loads(json_code)
2235 if 'Post' in json_data:
2236 data = json_data['Post']
# blip.tv reports timestamps like '12-31-12 08:15PM'; normalize to
# the YYYYMMDD form expected for upload_date.
2240 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2241 video_url = data['media']['url']
2242 umobj = re.match(self._URL_EXT, video_url)
2244 raise ValueError('Can not determine filename extension')
2245 ext = umobj.group(1)
2248 'id': data['item_id'],
2250 'uploader': data['display_name'],
2251 'upload_date': upload_date,
2252 'title': data['title'],
2254 'format': data['media']['mimeType'],
2255 'thumbnail': data['thumbnailUrl'],
2256 'description': data['description'],
2257 'player_url': data['embedUrl'],
2258 'user_agent': 'iTunes/10.6.1',
2260 except (ValueError,KeyError) as err:
2261 self._downloader.report_error(u'unable to parse video information: %s' % repr(err))
2267 class MyVideoIE(InfoExtractor):
2268 """Information Extractor for myvideo.de."""
2270 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2271 IE_NAME = u'myvideo'
2273 def __init__(self, downloader=None):
2274 InfoExtractor.__init__(self, downloader)
2276 def _real_extract(self,url):
2277 mobj = re.match(self._VALID_URL, url)
2279 self._download.report_error(u'invalid URL: %s' % url)
2282 video_id = mobj.group(1)
2285 webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
2286 webpage = self._download_webpage(webpage_url, video_id)
2288 self.report_extraction(video_id)
2289 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\'',
2292 self._downloader.report_error(u'unable to extract media URL')
2294 video_url = mobj.group(1) + ('/%s.flv' % video_id)
2296 mobj = re.search('<title>([^<]+)</title>', webpage)
2298 self._downloader.report_error(u'unable to extract title')
2301 video_title = mobj.group(1)
2307 'upload_date': None,
2308 'title': video_title,
2312 class ComedyCentralIE(InfoExtractor):
2313 """Information extractor for The Daily Show and Colbert Report """
2315 # urls can be abbreviations like :thedailyshow or :colbert
2316 # urls for episodes like:
2317 # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
2318 # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
2319 # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
2320 _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
2321 |(https?://)?(www\.)?
2322 (?P<showname>thedailyshow|colbertnation)\.com/
2323 (full-episodes/(?P<episode>.*)|
2325 (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
2326 |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
2329 _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
2331 _video_extensions = {
2339 _video_dimensions = {
2349 def suitable(cls, url):
2350 """Receives a URL and returns True if suitable for this IE."""
# Overridden (vs. the base class) because _VALID_URL is a verbose
# regex and must be matched with re.VERBOSE.
2351 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
2353 def report_config_download(self, episode_id, media_id):
2354 self.to_screen(u'%s: Downloading configuration for %s' % (episode_id, media_id))
2356 def report_index_download(self, episode_id):
2357 self.to_screen(u'%s: Downloading show index' % episode_id)
2359 def _print_formats(self, formats):
2360 print('Available formats:')
2362 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))
2365 def _real_extract(self, url):
2366 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2368 self._downloader.report_error(u'invalid URL: %s' % url)
# Expand ":tds"/":colbert"-style shorthands to the corresponding
# full-episodes URL and re-match so the named groups are populated.
2371 if mobj.group('shortname'):
2372 if mobj.group('shortname') in ('tds', 'thedailyshow'):
2373 url = u'http://www.thedailyshow.com/full-episodes/'
2375 url = u'http://www.colbertnation.com/full-episodes/'
2376 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2377 assert mobj is not None
2379 if mobj.group('clip'):
2380 if mobj.group('showname') == 'thedailyshow':
2381 epTitle = mobj.group('tdstitle')
2383 epTitle = mobj.group('cntitle')
2386 dlNewest = not mobj.group('episode')
2388 epTitle = mobj.group('showname')
2390 epTitle = mobj.group('episode')
2392 req = compat_urllib_request.Request(url)
2393 self.report_extraction(epTitle)
2395 htmlHandle = compat_urllib_request.urlopen(req)
2396 html = htmlHandle.read()
2397 webpage = html.decode('utf-8')
2398 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2399 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
# The "newest" index URL redirects to a concrete episode; re-match the
# final URL so mobj reflects the episode actually served.
2402 url = htmlHandle.geturl()
2403 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2405 self._downloader.report_error(u'Invalid redirected URL: ' + url)
2407 if mobj.group('episode') == '':
2408 self._downloader.report_error(u'Redirected URL is still not specific: ' + url)
2410 epTitle = mobj.group('episode')
# Look for the mtvnservices player URI embedded in the page.
2412 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)
2414 if len(mMovieParams) == 0:
2415 # The Colbert Report embeds the information in a without
2416 # a URL prefix; so extract the alternate reference
2417 # and then add the URL prefix manually.
2419 altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
2420 if len(altMovieParams) == 0:
2421 self._downloader.report_error(u'unable to find Flash URL in webpage ' + url)
2424 mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
2426 uri = mMovieParams[0][1]
2427 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
2428 self.report_index_download(epTitle)
2430 indexXml = compat_urllib_request.urlopen(indexUrl).read()
2431 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2432 self._downloader.report_error(u'unable to download episode index: ' + compat_str(err))
# Each <item> in the MRSS index is one part of the episode; its <guid>
# encodes the media id as colon-separated components.
2437 idoc = xml.etree.ElementTree.fromstring(indexXml)
2438 itemEls = idoc.findall('.//item')
2439 for partNum,itemEl in enumerate(itemEls):
2440 mediaId = itemEl.findall('./guid')[0].text
2441 shortMediaId = mediaId.split(':')[-1]
2442 showId = mediaId.split(':')[-2].replace('.com', '')
2443 officialTitle = itemEl.findall('./title')[0].text
2444 officialDate = itemEl.findall('./pubDate')[0].text
2446 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2447 compat_urllib_parse.urlencode({'uri': mediaId}))
2448 configReq = compat_urllib_request.Request(configUrl)
2449 self.report_config_download(epTitle, shortMediaId)
2451 configXml = compat_urllib_request.urlopen(configReq).read()
2452 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2453 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
# Collect (bitrate, rtmp-url) pairs from each <rendition> element.
2456 cdoc = xml.etree.ElementTree.fromstring(configXml)
2458 for rendition in cdoc.findall('.//rendition'):
2459 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2463 self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
2466 if self._downloader.params.get('listformats', None):
2467 self._print_formats([i[0] for i in turls])
2470 # For now, just pick the highest bitrate
2471 format,rtmp_video_url = turls[-1]
2473 # Get the format arg from the arg stream
2474 req_format = self._downloader.params.get('format', None)
2476 # Select format if we can find one
2479 format, rtmp_video_url = f, v
# Rewrite the rtmp(e) URL into a direct HTTP URL on the mtvnmobile CDN
# so no rtmpdump is needed for this part.
2482 m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
2484 raise ExtractorError(u'Cannot transform RTMP url')
2485 base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
2486 video_url = base + m.group('finalid')
2488 effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
2493 'upload_date': officialDate,
2498 'description': officialTitle,
2500 results.append(info)
2505 class EscapistIE(InfoExtractor):
2506 """Information extractor for The Escapist """
2508 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2509 IE_NAME = u'escapist'
2511 def report_config_download(self, showName):
2512 self.to_screen(u'%s: Downloading configuration' % showName)
2514 def _real_extract(self, url):
2515 mobj = re.match(self._VALID_URL, url)
2517 self._downloader.report_error(u'invalid URL: %s' % url)
2519 showName = mobj.group('showname')
2520 videoId = mobj.group('episode')
2522 self.report_extraction(showName)
# Decode the page with the charset the server declares, falling back
# to UTF-8 when no charset is present in Content-Type.
2524 webPage = compat_urllib_request.urlopen(url)
2525 webPageBytes = webPage.read()
2526 m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
2527 webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
2528 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2529 self._downloader.report_error(u'unable to download webpage: ' + compat_str(err))
# Description, thumbnail and player URL all come from <meta> tags;
# the config URL is percent-encoded inside the og:video player URL.
2532 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2533 description = unescapeHTML(descMatch.group(1))
2534 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2535 imgUrl = unescapeHTML(imgMatch.group(1))
2536 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2537 playerUrl = unescapeHTML(playerUrlMatch.group(1))
2538 configUrlMatch = re.search('config=(.*)$', playerUrl)
2539 configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))
2541 self.report_config_download(showName)
2543 configJSON = compat_urllib_request.urlopen(configUrl)
2544 m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
2545 configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
2546 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2547 self._downloader.report_error(u'unable to download configuration: ' + compat_str(err))
2550 # Technically, it's JavaScript, not JSON
# Single quotes are swapped to double quotes so json.loads accepts it.
2551 configJSON = configJSON.replace("'", '"')
2554 config = json.loads(configJSON)
2555 except (ValueError,) as err:
2556 self._downloader.report_error(u'Invalid JSON in configuration file: ' + compat_str(err))
# The second playlist entry holds the actual media URL.
2559 playlist = config['playlist']
2560 videoUrl = playlist[1]['url']
2565 'uploader': showName,
2566 'upload_date': None,
2569 'thumbnail': imgUrl,
2570 'description': description,
2571 'player_url': playerUrl,
2576 class CollegeHumorIE(InfoExtractor):
2577 """Information extractor for collegehumor.com"""
2580 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2581 IE_NAME = u'collegehumor'
2583 def report_manifest(self, video_id):
2584 """Report information extraction."""
2585 self.to_screen(u'%s: Downloading XML manifest' % video_id)
2587 def _real_extract(self, url):
2588 mobj = re.match(self._VALID_URL, url)
2590 self._downloader.report_error(u'invalid URL: %s' % url)
2592 video_id = mobj.group('videoid')
2597 'upload_date': None,
2600 self.report_extraction(video_id)
# First request: metadata XML with title/description/thumbnail and the
# URL of the f4m-style manifest.
2601 xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
2603 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2604 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2605 self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
2608 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2610 videoNode = mdoc.findall('./video')[0]
2611 info['description'] = videoNode.findall('./description')[0].text
2612 info['title'] = videoNode.findall('./caption')[0].text
2613 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
2614 manifest_url = videoNode.findall('./file')[0].text
2616 self._downloader.report_error(u'Invalid metadata XML file')
# Second request: the Adobe HDS manifest (hdcore param required).
2619 manifest_url += '?hdcore=2.10.3'
2620 self.report_manifest(video_id)
2622 manifestXml = compat_urllib_request.urlopen(manifest_url).read()
2623 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2624 self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
# Elements live in the Adobe f4m namespace; pull the media url and id.
2627 adoc = xml.etree.ElementTree.fromstring(manifestXml)
2629 media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
2630 node_id = media_node.attrib['url']
2631 video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
2632 except IndexError as err:
2633 self._downloader.report_error(u'Invalid manifest file')
# Compose the direct segment URL from the manifest location and ids.
2636 url_pr = compat_urllib_parse_urlparse(manifest_url)
2637 url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
2644 class XVideosIE(InfoExtractor):
2645 """Information extractor for xvideos.com"""
2647 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2648 IE_NAME = u'xvideos'
2650 def _real_extract(self, url):
2651 mobj = re.match(self._VALID_URL, url)
2653 self._downloader.report_error(u'invalid URL: %s' % url)
2655 video_id = mobj.group(1)
2657 webpage = self._download_webpage(url, video_id)
2659 self.report_extraction(video_id)
# The media URL is percent-encoded in the flv_url flashvars parameter.
2663 mobj = re.search(r'flv_url=(.+?)&', webpage)
2665 self._downloader.report_error(u'unable to extract video url')
2667 video_url = compat_urllib_parse.unquote(mobj.group(1))
# Title is the <title> text minus the trailing " - XVID..." suffix.
2671 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2673 self._downloader.report_error(u'unable to extract video title')
2675 video_title = mobj.group(1)
2678 # Extract video thumbnail
# group(0): the whole matched image URL is the thumbnail.
2679 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
2681 self._downloader.report_error(u'unable to extract video thumbnail')
2683 video_thumbnail = mobj.group(0)
2689 'upload_date': None,
2690 'title': video_title,
2692 'thumbnail': video_thumbnail,
2693 'description': None,
2699 class SoundcloudIE(InfoExtractor):
2700 """Information extractor for soundcloud.com
2701 To access the media, the uid of the song and a stream token
2702 must be extracted from the page source and the script must make
2703 a request to media.soundcloud.com/crossdomain.xml. Then
2704 the media can be grabbed by requesting from an url composed
2705 of the stream token and uid
2708 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2709 IE_NAME = u'soundcloud'
2711 def __init__(self, downloader=None):
2712 InfoExtractor.__init__(self, downloader)
2714 def report_resolve(self, video_id):
2715 """Report information extraction."""
2716 self.to_screen(u'%s: Resolving id' % video_id)
2718 def _real_extract(self, url):
2719 mobj = re.match(self._VALID_URL, url)
2721 self._downloader.report_error(u'invalid URL: %s' % url)
2724 # extract uploader (which is in the url)
2725 uploader = mobj.group(1)
2726 # extract simple title (uploader + slug of song title)
2727 slug_title = mobj.group(2)
2728 simple_title = uploader + u'-' + slug_title
2730 self.report_resolve('%s/%s' % (uploader, slug_title))
# resolve.json maps the public track URL to the API track object
# (including the numeric track id used below).
2732 url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
2733 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2734 request = compat_urllib_request.Request(resolv_url)
2736 info_json_bytes = compat_urllib_request.urlopen(request).read()
2737 info_json = info_json_bytes.decode('utf-8')
2738 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2739 self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
2742 info = json.loads(info_json)
2743 video_id = info['id']
2744 self.report_extraction('%s/%s' % (uploader, slug_title))
# The streams endpoint returns the per-format media URLs; the 128kbps
# MP3 HTTP stream is selected below.
2746 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2747 request = compat_urllib_request.Request(streams_url)
2749 stream_json_bytes = compat_urllib_request.urlopen(request).read()
2750 stream_json = stream_json_bytes.decode('utf-8')
2751 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2752 self._downloader.report_error(u'unable to download stream definitions: %s' % compat_str(err))
2755 streams = json.loads(stream_json)
2756 mediaURL = streams['http_mp3_128_url']
2761 'uploader': info['user']['username'],
2762 'upload_date': info['created_at'],
2763 'title': info['title'],
2765 'description': info['description'],
2768 class SoundcloudSetIE(InfoExtractor):
2769 """Information extractor for soundcloud.com sets
2770 To access the media, the uid of the song and a stream token
2771 must be extracted from the page source and the script must make
2772 a request to media.soundcloud.com/crossdomain.xml. Then
2773 the media can be grabbed by requesting from an url composed
2774 of the stream token and uid
2777 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
2778 IE_NAME = u'soundcloud'
2780 def __init__(self, downloader=None):
2781 InfoExtractor.__init__(self, downloader)
2783 def report_resolve(self, video_id):
2784 """Report information extraction."""
2785 self.to_screen(u'%s: Resolving id' % video_id)
2787 def _real_extract(self, url):
2788 mobj = re.match(self._VALID_URL, url)
2790 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2793 # extract uploader (which is in the url)
2794 uploader = mobj.group(1)
2795 # extract simple title (uploader + slug of song title)
2796 slug_title = mobj.group(2)
2797 simple_title = uploader + u'-' + slug_title
2799 self.report_resolve('%s/sets/%s' % (uploader, slug_title))
2801 url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
2802 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2803 request = compat_urllib_request.Request(resolv_url)
2805 info_json_bytes = compat_urllib_request.urlopen(request).read()
2806 info_json = info_json_bytes.decode('utf-8')
2807 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2808 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2812 info = json.loads(info_json)
2813 if 'errors' in info:
2814 for err in info['errors']:
2815 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err['error_message']))
2818 for track in info['tracks']:
2819 video_id = track['id']
2820 self.report_extraction('%s/sets/%s' % (uploader, slug_title))
2822 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2823 request = compat_urllib_request.Request(streams_url)
2825 stream_json_bytes = compat_urllib_request.urlopen(request).read()
2826 stream_json = stream_json_bytes.decode('utf-8')
2827 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2828 self._downloader.trouble(u'ERROR: unable to download stream definitions: %s' % compat_str(err))
2831 streams = json.loads(stream_json)
2832 mediaURL = streams['http_mp3_128_url']
2837 'uploader': track['user']['username'],
2838 'upload_date': track['created_at'],
2839 'title': track['title'],
2841 'description': track['description'],
2846 class InfoQIE(InfoExtractor):
2847 """Information extractor for infoq.com"""
2848 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2850 def _real_extract(self, url):
2851 mobj = re.match(self._VALID_URL, url)
2853 self._downloader.report_error(u'invalid URL: %s' % url)
2856 webpage = self._download_webpage(url, video_id=url)
2857 self.report_extraction(url)
# The real media path is base64-encoded in the jsclassref attribute,
# then percent-decoded, and served over RTMPE.
2860 mobj = re.search(r"jsclassref='([^']*)'", webpage)
2862 self._downloader.report_error(u'unable to extract video url')
2864 real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
2865 video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id
2868 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2870 self._downloader.report_error(u'unable to extract video title')
2872 video_title = mobj.group(1)
2874 # Extract description
# Description is optional; a placeholder is used when the meta tag is
# absent.
2875 video_description = u'No description available.'
2876 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2877 if mobj is not None:
2878 video_description = mobj.group(1)
# Derive id and extension from the media file's basename.
2880 video_filename = video_url.split('/')[-1]
2881 video_id, extension = video_filename.split('.')
2887 'upload_date': None,
2888 'title': video_title,
2889 'ext': extension, # Extension is always(?) mp4, but seems to be flv
2891 'description': video_description,
2896 class MixcloudIE(InfoExtractor):
2897 """Information extractor for www.mixcloud.com"""
2899 _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
2900 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2901 IE_NAME = u'mixcloud'
2903 def __init__(self, downloader=None):
2904 InfoExtractor.__init__(self, downloader)
2906 def report_download_json(self, file_id):
2907 """Report JSON download."""
2908 self.to_screen(u'Downloading json')
2910 def get_urls(self, jsonData, fmt, bitrate='best'):
2911 """Get urls from 'audio_formats' section in json"""
# jsonData[fmt] is either a {bitrate: [urls]} mapping or a flat url
# list; the TypeError fallback below handles the flat case.
2914 bitrate_list = jsonData[fmt]
2915 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2916 bitrate = max(bitrate_list) # select highest
2918 url_list = jsonData[fmt][bitrate]
2919 except TypeError: # we have no bitrate info.
2920 url_list = jsonData[fmt]
2923 def check_urls(self, url_list):
2924 """Returns 1st active url from list"""
# Probe each candidate URL and keep the first one that opens.
2925 for url in url_list:
2927 compat_urllib_request.urlopen(url)
2929 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2934 def _print_formats(self, formats):
2935 print('Available formats:')
2936 for fmt in formats.keys():
2937 for b in formats[fmt]:
2939 ext = formats[fmt][b][0]
2940 print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
2941 except TypeError: # we have no bitrate info
2942 ext = formats[fmt][0]
2943 print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
2946 def _real_extract(self, url):
2947 mobj = re.match(self._VALID_URL, url)
2949 self._downloader.report_error(u'invalid URL: %s' % url)
2951 # extract uploader & filename from url
# NOTE(review): .decode('utf-8') on regex groups is a Python-2 str
# idiom; under Python 3 these calls would raise AttributeError --
# likely part of why this IE is marked _WORKING = False.
2952 uploader = mobj.group(1).decode('utf-8')
2953 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2955 # construct API request
2956 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2957 # retrieve .json file with links to files
2958 request = compat_urllib_request.Request(file_url)
2960 self.report_download_json(file_url)
2961 jsonData = compat_urllib_request.urlopen(request).read()
2962 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2963 self._downloader.report_error(u'Unable to retrieve file: %s' % compat_str(err))
2967 json_data = json.loads(jsonData)
2968 player_url = json_data['player_swf_url']
2969 formats = dict(json_data['audio_formats'])
2971 req_format = self._downloader.params.get('format', None)
2974 if self._downloader.params.get('listformats', None):
2975 self._print_formats(formats)
# Default/'best': probe formats in turn and take the first live URL;
# otherwise honor the explicitly requested format.
2978 if req_format is None or req_format == 'best':
2979 for format_param in formats.keys():
2980 url_list = self.get_urls(formats, format_param)
2982 file_url = self.check_urls(url_list)
2983 if file_url is not None:
2986 if req_format not in formats:
2987 self._downloader.report_error(u'format is not available')
2990 url_list = self.get_urls(formats, req_format)
2991 file_url = self.check_urls(url_list)
2992 format_param = req_format
2995 'id': file_id.decode('utf-8'),
2996 'url': file_url.decode('utf-8'),
2997 'uploader': uploader.decode('utf-8'),
2998 'upload_date': None,
2999 'title': json_data['name'],
3000 'ext': file_url.split('.')[-1].decode('utf-8'),
3001 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
3002 'thumbnail': json_data['thumbnail_url'],
3003 'description': json_data['description'],
3004 'player_url': player_url.decode('utf-8'),
# Stanford Open ClassRoom extractor. Handles three URL shapes via named regex
# groups: a specific video (course+video), a course page (course only), and the
# site root (neither). Course/root pages recurse via self.extract() over links.
# NOTE(review): lossy sample — "if ... is None:" guards, try:/else: branches,
# dict openers and return statements are elided; lines carry stale numbers.
3007 class StanfordOpenClassroomIE(InfoExtractor):
3008 """Information extractor for Stanford's Open ClassRoom"""
3010 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
3011 IE_NAME = u'stanfordoc'
3013 def report_download_webpage(self, objid):
3014 """Report information extraction."""
3015 self.to_screen(u'%s: Downloading webpage' % objid)
3017 def _real_extract(self, url):
3018 mobj = re.match(self._VALID_URL, url)
# presumably guarded by "if mobj is None:" (elided)
3020 raise ExtractorError(u'Invalid URL: %s' % url)
# --- case 1: a single video within a course ---
3022 if mobj.group('course') and mobj.group('video'): # A specific video
3023 course = mobj.group('course')
3024 video = mobj.group('video')
# entries of the per-video info dict (opener elided)
3026 'id': course + '_' + video,
3028 'upload_date': None,
3031 self.report_extraction(info['id'])
3032 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
3033 xmlUrl = baseUrl + video + '.xml'
3035 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
3036 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3037 self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
# title and file name come from the per-video XML manifest
3039 mdoc = xml.etree.ElementTree.fromstring(metaXml)
3041 info['title'] = mdoc.findall('./title')[0].text
3042 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
# presumably the except branch for a malformed manifest (IndexError; try: elided)
3044 self._downloader.report_error(u'Invalid metadata XML file')
3046 info['ext'] = info['url'].rpartition('.')[2]
# --- case 2: a course page; collect VideoPage links as playlist references ---
3048 elif mobj.group('course'): # A course page
3049 course = mobj.group('course')
3054 'upload_date': None,
3057 coursepage = self._download_webpage(url, info['id'],
3058 note='Downloading course info page',
3059 errnote='Unable to download course info page')
3061 m = re.search('<h1>([^<]+)</h1>', coursepage)
# title from <h1> when present, otherwise fall back to the id
3063 info['title'] = unescapeHTML(m.group(1))
3065 info['title'] = info['id']
3067 m = re.search('<description>([^<]+)</description>', coursepage)
3069 info['description'] = unescapeHTML(m.group(1))
# orderedSet keeps link order while dropping duplicates
3071 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
3074 'type': 'reference',
3075 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
# recurse into each referenced video page
3079 for entry in info['list']:
3080 assert entry['type'] == 'reference'
3081 results += self.extract(entry['url'])
# --- case 3: site root; collect CoursePage links the same way ---
3085 'id': 'Stanford OpenClassroom',
3088 'upload_date': None,
3091 self.report_download_webpage(info['id'])
3092 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
3094 rootpage = compat_urllib_request.urlopen(rootURL).read()
3095 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3096 self._downloader.report_error(u'unable to download course info page: ' + compat_str(err))
3099 info['title'] = info['id']
3101 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
3104 'type': 'reference',
3105 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
3110 for entry in info['list']:
3111 assert entry['type'] == 'reference'
3112 results += self.extract(entry['url'])
# MTV.com extractor: scrapes <meta> tags for song/performer/uri, then fetches a
# mediaGen XML playlist and picks the last (highest-quality) rendition.
# NOTE(review): lossy sample — "if mobj is None:" guards, try:, and the final
# "return [{...}]" wrapper are elided; lines carry stale embedded numbers.
3115 class MTVIE(InfoExtractor):
3116 """Information extractor for MTV.com"""
3118 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
3121 def _real_extract(self, url):
3122 mobj = re.match(self._VALID_URL, url)
# presumably reached only when mobj is None (guard elided)
3124 self._downloader.report_error(u'invalid URL: %s' % url)
# normalize scheme-less URLs so _download_webpage gets an absolute URL
3126 if not mobj.group('proto'):
3127 url = 'http://' + url
3128 video_id = mobj.group('videoid')
3130 webpage = self._download_webpage(url, video_id)
# scrape song name / performer; pages are declared iso-8859-1
3132 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
3134 self._downloader.report_error(u'unable to extract song name')
3136 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3137 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
3139 self._downloader.report_error(u'unable to extract performer')
3141 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3142 video_title = performer + ' - ' + song_name
3144 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
# NOTE(review): message reads 'unable to mtvn_uri' — likely meant
# 'unable to extract mtvn_uri'; left as-is since this is a doc-only pass
3146 self._downloader.report_error(u'unable to mtvn_uri')
3148 mtvn_uri = mobj.group(1)
3150 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
3152 self._downloader.report_error(u'unable to extract content id')
3154 content_id = mobj.group(1)
# mediaGen endpoint returns an XML document listing renditions
3156 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
3157 self.report_extraction(video_id)
3158 request = compat_urllib_request.Request(videogen_url)
3160 metadataXml = compat_urllib_request.urlopen(request).read()
3161 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3162 self._downloader.report_error(u'unable to download video metadata: %s' % compat_str(err))
3165 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
3166 renditions = mdoc.findall('.//rendition')
3168 # For now, always pick the highest quality.
3169 rendition = renditions[-1]
# format string like "mp4-640x480_800"; built from the rendition attributes
3172 _,_,ext = rendition.attrib['type'].partition('/')
3173 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
3174 video_url = rendition.find('./src').text
# presumably the except branch for a malformed rendition (try: elided)
3176 self._downloader.trouble('Invalid rendition field.')
# entries of the returned info dict (opener/closer elided)
3182 'uploader': performer,
3183 'upload_date': None,
3184 'title': video_title,
# Youku (v.youku.com) extractor: fetches the getPlayList JSON, de-obfuscates the
# segment file id with a seeded pseudo-random shuffle, then emits one info dict
# per video segment.
# NOTE(review): lossy sample — the "def _gen_sid" header, "mixed = []" init,
# several returns, guards and the final return are elided; stale line numbers.
3192 class YoukuIE(InfoExtractor):
3193 _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
3195 def report_download_webpage(self, file_id):
3196 """Report webpage download."""
3197 self.to_screen(u'%s: Downloading webpage' % file_id)
# session id: current ms timestamp + two random numbers ("def _gen_sid" elided)
3200 nowTime = int(time.time() * 1000)
3201 random1 = random.randint(1000,1998)
3202 random2 = random.randint(1000,9999)
3204 return "%d%d%d" %(nowTime,random1,random2)
3206 def _get_file_ID_mix_string(self, seed):
# deterministic shuffle of the alphabet, driven by a linear-congruential step
3208 source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
3210 for i in range(len(source)):
3211 seed = (seed * 211 + 30031 ) % 65536
3212 index = math.floor(seed / 65536 * len(source) )
3213 mixed.append(source[int(index)])
3214 source.remove(source[int(index)])
3215 #return ''.join(mixed)
3218 def _get_file_id(self, fileId, seed):
# decode '*'-separated digit indices through the shuffled alphabet
3219 mixed = self._get_file_ID_mix_string(seed)
3220 ids = fileId.split('*')
3224 realId.append(mixed[int(ch)])
3225 return ''.join(realId)
3227 def _real_extract(self, url):
3228 mobj = re.match(self._VALID_URL, url)
# presumably reached only when mobj is None (guard elided)
3230 self._downloader.report_error(u'invalid URL: %s' % url)
3232 video_id = mobj.group('ID')
3234 info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
3236 request = compat_urllib_request.Request(info_url, None, std_headers)
3238 self.report_download_webpage(video_id)
3239 jsondata = compat_urllib_request.urlopen(request).read()
3240 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3241 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
3244 self.report_extraction(video_id)
3246 jsonstr = jsondata.decode('utf-8')
3247 config = json.loads(jsonstr)
3249 video_title = config['data'][0]['title']
3250 seed = config['data'][0]['seed']
# format selection: 'best' prefers hd2 when advertised, 'worst' the opposite
3252 format = self._downloader.params.get('format', None)
3253 supported_format = list(config['data'][0]['streamfileids'].keys())
3255 if format is None or format == 'best':
3256 if 'hd2' in supported_format:
3261 elif format == 'worst':
3269 fileid = config['data'][0]['streamfileids'][format]
3270 keys = [s['k'] for s in config['data'][0]['segs'][format]]
3271 except (UnicodeDecodeError, ValueError, KeyError):
3272 self._downloader.report_error(u'unable to extract info section')
3276 sid = self._gen_sid()
3277 fileid = self._get_file_id(fileid, seed)
3279 #column 8,9 of fileid represent the segment number
3280 #fileid[7:9] should be changed
# one download URL per segment; segment index is hex-encoded into the file id
3281 for index, key in enumerate(keys):
3283 temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
3284 download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
# per-segment info dict entries (opener elided)
3287 'id': '%s_part%02d' % (video_id, index),
3288 'url': download_url,
3290 'upload_date': None,
3291 'title': video_title,
3294 files_info.append(info)
# XNXX extractor: plain regex scrape of the watch page for the flv URL, title
# and thumbnail (all three patterns are class attributes).
# NOTE(review): lossy sample — "if result is None:" guards, try:, and the info
# dict opener / final return are elided; lines carry stale embedded numbers.
3299 class XNXXIE(InfoExtractor):
3300 """Information extractor for xnxx.com"""
3302 _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
3304 VIDEO_URL_RE = r'flv_url=(.*?)&'
3305 VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
3306 VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'
3308 def report_webpage(self, video_id):
3309 """Report information extraction"""
3310 self.to_screen(u'%s: Downloading webpage' % video_id)
3312 def _real_extract(self, url):
3313 mobj = re.match(self._VALID_URL, url)
# presumably reached only when mobj is None (guard elided)
3315 self._downloader.report_error(u'invalid URL: %s' % url)
3317 video_id = mobj.group(1)
3319 self.report_webpage(video_id)
3321 # Get webpage content
3323 webpage_bytes = compat_urllib_request.urlopen(url).read()
3324 webpage = webpage_bytes.decode('utf-8')
3325 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3326 self._downloader.report_error(u'unable to download video webpage: %s' % err)
# flv URL is percent-encoded in the page source
3329 result = re.search(self.VIDEO_URL_RE, webpage)
3331 self._downloader.report_error(u'unable to extract video url')
3333 video_url = compat_urllib_parse.unquote(result.group(1))
3335 result = re.search(self.VIDEO_TITLE_RE, webpage)
3337 self._downloader.report_error(u'unable to extract video title')
3339 video_title = result.group(1)
3341 result = re.search(self.VIDEO_THUMB_RE, webpage)
3343 self._downloader.report_error(u'unable to extract video thumbnail')
3345 video_thumbnail = result.group(1)
# entries of the returned info dict (opener/closer elided)
3351 'upload_date': None,
3352 'title': video_title,
3354 'thumbnail': video_thumbnail,
3355 'description': None,
# Google+ (plus.google.com) post extractor: scrapes the post page for date,
# uploader and title, follows the embedded photos page, then collects the
# redirector.googlevideo.com links and picks the highest resolution.
# NOTE(review): lossy sample — "if mobj is None:" guards, try:, and the final
# "return [{...}]" wrapper are elided; lines carry stale embedded numbers.
3359 class GooglePlusIE(InfoExtractor):
3360 """Information extractor for plus.google.com."""
3362 _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
3363 IE_NAME = u'plus.google'
3365 def __init__(self, downloader=None):
3366 InfoExtractor.__init__(self, downloader)
3368 def report_extract_entry(self, url):
3369 """Report downloading extry"""
3370 self.to_screen(u'Downloading entry: %s' % url)
3372 def report_date(self, upload_date):
3373 """Report downloading extry"""
3374 self.to_screen(u'Entry date: %s' % upload_date)
3376 def report_uploader(self, uploader):
3377 """Report downloading extry"""
3378 self.to_screen(u'Uploader: %s' % uploader)
3380 def report_title(self, video_title):
3381 """Report downloading extry"""
3382 self.to_screen(u'Title: %s' % video_title)
3384 def report_extract_vid_page(self, video_page):
3385 """Report information extraction."""
3386 self.to_screen(u'Extracting video page: %s' % video_page)
3388 def _real_extract(self, url):
3389 # Extract id from URL
3390 mobj = re.match(self._VALID_URL, url)
# presumably reached only when mobj is None (guard elided)
3392 self._downloader.report_error(u'Invalid URL: %s' % url)
3395 post_url = mobj.group(0)
3396 video_id = mobj.group(1)
3398 video_extension = 'flv'
3400 # Step 1, Retrieve post webpage to extract further information
3401 self.report_extract_entry(post_url)
3402 request = compat_urllib_request.Request(post_url)
3404 webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3405 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3406 self._downloader.report_error(u'Unable to retrieve entry webpage: %s' % compat_str(err))
3409 # Extract update date
3411 pattern = 'title="Timestamp">(.*?)</a>'
3412 mobj = re.search(pattern, webpage)
3414 upload_date = mobj.group(1)
3415 # Convert timestring to a format suitable for filename
# normalize "YYYY-MM-DD" to the downloader's YYYYMMDD convention
3416 upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3417 upload_date = upload_date.strftime('%Y%m%d')
3418 self.report_date(upload_date)
# extract uploader from the rel="author" anchor
3422 pattern = r'rel\="author".*?>(.*?)</a>'
3423 mobj = re.search(pattern, webpage)
3425 uploader = mobj.group(1)
3426 self.report_uploader(uploader)
3429 # Get the first line for title
3431 pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3432 mobj = re.search(pattern, webpage)
3434 video_title = mobj.group(1)
3435 self.report_title(video_title)
3437 # Step 2, Stimulate clicking the image box to launch video
3438 pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3439 mobj = re.search(pattern, webpage)
3441 self._downloader.report_error(u'unable to extract video page URL')
3443 video_page = mobj.group(1)
3444 request = compat_urllib_request.Request(video_page)
3446 webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3447 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3448 self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
3450 self.report_extract_vid_page(video_page)
3453 # Extract video links on video page
3454 """Extract video links of all sizes"""
3455 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3456 mobj = re.findall(pattern, webpage)
3458 self._downloader.report_error(u'unable to extract video links')
3460 # Sort in resolution
3461 links = sorted(mobj)
3463 # Choose the lowest of the sort, i.e. highest resolution
3464 video_url = links[-1]
3465 # Only get the url. The resolution part in the tuple has no use anymore
3466 video_url = video_url[-1]
3467 # Treat escaped \u0026 style hex
# Python-2 path: str.decode exists; Python 3 raises AttributeError and
# falls through to the bytes round-trip below
3469 video_url = video_url.decode("unicode_escape")
3470 except AttributeError: # Python 3
3471 video_url = bytes(video_url, 'ascii').decode('unicode-escape')
# entries of the returned info dict (opener/closer elided)
3477 'uploader': uploader,
3478 'upload_date': upload_date,
3479 'title': video_title,
3480 'ext': video_extension,
# NBA.com extractor: the CDN URL is constructed directly from the path-derived
# video id; metadata is scraped from og: meta tags with a tolerant helper.
# NOTE(review): lossy sample — guards, the helper's "if m:" / default branch,
# and the info dict opener/closer are elided; lines carry stale numbers.
3483 class NBAIE(InfoExtractor):
3484 _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
3487 def _real_extract(self, url):
3488 mobj = re.match(self._VALID_URL, url)
# presumably reached only when mobj is None (guard elided)
3490 self._downloader.report_error(u'invalid URL: %s' % url)
3493 video_id = mobj.group(1)
3494 if video_id.endswith('/index.html'):
3495 video_id = video_id[:-len('/index.html')]
3497 webpage = self._download_webpage(url, video_id)
# fixed CDN URL template; 1280x720 mp4 rendition
3499 video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
# closure over `webpage`: return the unescaped first group, or `default`
3500 def _findProp(rexp, default=None):
3501 m = re.search(rexp, webpage)
3503 return unescapeHTML(m.group(1))
3507 shortened_video_id = video_id.rpartition('/')[2]
3508 title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
# info dict entries (opener/closer elided);
# NOTE(review): key reads 'uploader_date' — likely meant 'upload_date'
3510 'id': shortened_video_id,
3514 'uploader_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
3515 'description': _findProp(r'<div class="description">(.*?)</h1>'),
# justin.tv / twitch.tv extractor: pages through the JSON API 100 items at a
# time (channel archives when the URL names a channel, a single broadcast for
# /b/ URLs) and accumulates one info dict per clip.
# NOTE(review): lossy sample — try:, guards, "paged = ..." setup, offset
# bookkeeping and the final return are elided; lines carry stale numbers.
3519 class JustinTVIE(InfoExtractor):
3520 """Information extractor for justin.tv and twitch.tv"""
3521 # TODO: One broadcast may be split into multiple videos. The key
3522 # 'broadcast_id' is the same for all parts, and 'broadcast_part'
3523 # starts at 1 and increases. Can we treat all parts as one video?
3525 _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
3526 ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
3527 _JUSTIN_PAGE_LIMIT = 100
3528 IE_NAME = u'justin.tv'
3530 def report_download_page(self, channel, offset):
3531 """Report attempt to download a single page of videos."""
3532 self.to_screen(u'%s: Downloading video information from %d to %d' %
3533 (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
3535 # Return count of items, list of *valid* items
3536 def _parse_page(self, url):
3538 urlh = compat_urllib_request.urlopen(url)
3539 webpage_bytes = urlh.read()
3540 webpage = webpage_bytes.decode('utf-8', 'ignore')
3541 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3542 self._downloader.report_error(u'unable to download video info JSON: %s' % compat_str(err))
# API errors come back as a dict instead of a list of clips
3545 response = json.loads(webpage)
3546 if type(response) != list:
3547 error_text = response.get('error', 'unknown error')
3548 self._downloader.report_error(u'Justin.tv API: %s' % error_text)
3551 for clip in response:
3552 video_url = clip['video_file_url']
# 'start_time' is "YYYY-MM-DD..."; strip dashes to get YYYYMMDD
3554 video_extension = os.path.splitext(video_url)[1][1:]
3555 video_date = re.sub('-', '', clip['start_time'][:10])
3556 video_uploader_id = clip.get('user_id', clip.get('channel_id'))
3557 video_id = clip['id']
3558 video_title = clip.get('title', video_id)
# per-clip info dict entries (opener and append elided)
3562 'title': video_title,
3563 'uploader': clip.get('channel_name', video_uploader_id),
3564 'uploader_id': video_uploader_id,
3565 'upload_date': video_date,
3566 'ext': video_extension,
3568 return (len(response), info)
3570 def _real_extract(self, url):
3571 mobj = re.match(self._VALID_URL, url)
# presumably reached only when mobj is None (guard elided)
3573 self._downloader.report_error(u'invalid URL: %s' % url)
3576 api = 'http://api.justin.tv'
3577 video_id = mobj.group(mobj.lastindex)
# group 1 only => whole channel (paged); group 2 => single broadcast
3579 if mobj.lastindex == 1:
3581 api += '/channel/archives/%s.json'
3583 api += '/broadcast/by_archive/%s.json'
3584 api = api % (video_id,)
3586 self.report_extraction(video_id)
3590 limit = self._JUSTIN_PAGE_LIMIT
# page until the API returns fewer than `limit` items (loop header elided)
3593 self.report_download_page(video_id, offset)
3594 page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
3595 page_count, page_info = self._parse_page(page_url)
3596 info.extend(page_info)
3597 if not paged or page_count != limit:
# FunnyOrDie extractor: video URL from the <video>/<source> markup, title from
# the player h1 (falling back to <title>), optional og:description.
# NOTE(review): lossy sample — guards ("if m is None:"), the "if not m:" fall
# back chain, and the info dict opener/return are elided; stale line numbers.
3602 class FunnyOrDieIE(InfoExtractor):
3603 _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
3605 def _real_extract(self, url):
3606 mobj = re.match(self._VALID_URL, url)
# presumably reached only when mobj is None (guard elided)
3608 self._downloader.report_error(u'invalid URL: %s' % url)
3611 video_id = mobj.group('id')
3612 webpage = self._download_webpage(url, video_id)
3614 m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
3616 self._downloader.report_error(u'unable to find video information')
3617 video_url = unescapeHTML(m.group('url'))
# primary title from the player heading; <title> is the fallback
3619 m = re.search(r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", webpage, flags=re.DOTALL)
3621 m = re.search(r'<title>(?P<title>[^<]+?)</title>', webpage)
3623 self._downloader.trouble(u'Cannot find video title')
3624 title = clean_html(m.group('title'))
3626 m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
3628 desc = unescapeHTML(m.group('desc'))
# remaining info dict entry (opener/closer elided)
3637 'description': desc,
# Steam store extractor: bypasses the age gate via the agecheck URL, then zips
# three finditer scans (movie URLs, titles, thumbnails) into a playlist.
# NOTE(review): lossy sample — the verbose-regex body, "videos = []", the
# per-video dict opener and append are elided; lines carry stale numbers.
3641 class SteamIE(InfoExtractor):
3642 _VALID_URL = r"""http://store.steampowered.com/
3643 (?P<urltype>video|app)/ #If the page is only for videos or for a game
3645 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
# overrides the base class because _VALID_URL needs re.VERBOSE
3649 def suitable(cls, url):
3650 """Receives a URL and returns True if suitable for this IE."""
3651 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
3653 def _real_extract(self, url):
3654 m = re.match(self._VALID_URL, url, re.VERBOSE)
3655 gameID = m.group('gameID')
# pre-filled age-gate parameters avoid the interstitial page
3656 videourl = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970' % gameID
3657 self.report_age_confirmation()
3658 webpage = self._download_webpage(videourl, gameID)
3659 game_title = re.search(r'<h2 class="pageheader">(?P<game_title>.*?)</h2>', webpage).group('game_title')
3661 urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
3662 mweb = re.finditer(urlRE, webpage)
3663 namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
3664 titles = re.finditer(namesRE, webpage)
3665 thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
3666 thumbs = re.finditer(thumbsRE, webpage)
# the three scans are assumed to run in lockstep over the same movie entries
3668 for vid,vtitle,thumb in zip(mweb,titles,thumbs):
3669 video_id = vid.group('videoID')
3670 title = vtitle.group('videoName')
3671 video_url = vid.group('videoURL')
3672 video_thumb = thumb.group('thumbnail')
# presumably guarded by "if not video_url:" (guard elided)
3674 self._downloader.report_error(u'Cannot find video url for %s' % video_id)
# per-video dict entries (opener and append elided)
3679 'title': unescapeHTML(title),
3680 'thumbnail': video_thumb
3683 return [self.playlist_result(videos, gameID, game_title)]
# Ustream recorded-video extractor: direct CDN URL from the video id; title and
# uploader scraped from data- attributes on the page.
# NOTE(review): lossy sample — the info dict opener/closer and return are
# elided; lines carry stale embedded numbers.
3685 class UstreamIE(InfoExtractor):
3686 _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
3687 IE_NAME = u'ustream'
3689 def _real_extract(self, url):
3690 m = re.match(self._VALID_URL, url)
3691 video_id = m.group('videoID')
# CDN URL is derived from the id alone; the page is only needed for metadata
3692 video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
3693 webpage = self._download_webpage(url, video_id)
3694 m = re.search(r'data-title="(?P<title>.+)"',webpage)
3695 title = m.group('title')
3696 m = re.search(r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',webpage)
3697 uploader = m.group('uploader')
# remaining info dict entry (opener/closer elided)
3703 'uploader': uploader
# WorldStarHipHop extractor: regex-scrapes the raw page for an mp4/flv CDN URL,
# a <title>, and a thumbnail; "candy" pages get their title from candytitles.
# NOTE(review): lossy sample — try:, ext assignment branches, else-branches and
# the final info dict/return are elided; lines carry stale embedded numbers.
3707 class WorldStarHipHopIE(InfoExtractor):
3708 _VALID_URL = r'http://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
3709 IE_NAME = u'WorldStarHipHop'
3711 def _real_extract(self, url):
# pattern for the hosted video file (mp4 or flv on the hw-videos CDN)
3712 _src_url = r"""(http://hw-videos.*(?:mp4|flv))"""
3714 webpage_src = compat_urllib_request.urlopen(url).read()
3715 webpage_src = webpage_src.decode('utf-8')
3717 mobj = re.search(_src_url, webpage_src)
3719 m = re.match(self._VALID_URL, url)
3720 video_id = m.group('id')
3722 if mobj is not None:
3723 video_url = mobj.group()
# extension branches ("ext = 'mp4'" / "ext = 'flv'") are elided here
3724 if 'mp4' in video_url:
3729 self._downloader.trouble(u'ERROR: Cannot find video url for %s' % video_id)
3732 _title = r"""<title>(.*)</title>"""
3734 mobj = re.search(_title, webpage_src)
3736 if mobj is not None:
3737 title = mobj.group(1)
# fallback title when the page has no usable <title>
3739 title = 'World Start Hip Hop - %s' % time.ctime()
3741 _thumbnail = r"""rel="image_src" href="(.*)" />"""
3742 mobj = re.search(_thumbnail, webpage_src)
3744 # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
3745 if mobj is not None:
3746 thumbnail = mobj.group(1)
# candy pages: re-derive the title from the candytitles span
3748 _title = r"""candytitles.*>(.*)</span>"""
3749 mobj = re.search(_title, webpage_src)
3750 if mobj is not None:
3751 title = mobj.group(1)
# remaining info dict entry (opener/closer and return elided)
3758 'thumbnail' : thumbnail,
# RBMA Radio extractor: pulls the embedded gon.show JSON blob from the page,
# appends a 256kbps cbr parameter to the akamai URL, and derives the extension
# from the URL path.
# NOTE(review): lossy sample — "if not m:" guard, the info dict opener/closer
# and return are elided; lines carry stale embedded numbers.
3763 class RBMARadioIE(InfoExtractor):
3764 _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'
3766 def _real_extract(self, url):
3767 m = re.match(self._VALID_URL, url)
3768 video_id = m.group('videoID')
3770 webpage = self._download_webpage(url, video_id)
3771 m = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
3773 raise ExtractorError(u'Cannot find metadata')
3774 json_data = m.group(1)
3777 data = json.loads(json_data)
3778 except ValueError as e:
3779 raise ExtractorError(u'Invalid JSON: ' + str(e))
# request the 256 kbit/s constant-bitrate stream
3781 video_url = data['akamai_url'] + '&cbr=256'
3782 url_parts = compat_urllib_parse_urlparse(video_url)
3783 video_ext = url_parts.path.rpartition('.')[2]
# info dict entries (opener/closer elided); .get() keeps missing keys as None
3788 'title': data['title'],
3789 'description': data.get('teaser_text'),
3790 'location': data.get('country_of_origin'),
3791 'uploader': data.get('host', {}).get('name'),
3792 'uploader_id': data.get('host', {}).get('slug'),
3793 'thumbnail': data.get('image', {}).get('large_url_2x'),
3794 'duration': data.get('duration'),
# YouPorn extractor: sets the age_verified cookie, scrapes the download list,
# and builds one info dict per size/bitrate link; --format selects among them
# (best = first, worst = last, all/-1 = every format).
# NOTE(review): lossy sample — guards, "for x in formats", the per-link loop
# header, size/bitrate unpacking, dict openers and returns are elided.
3799 class YouPornIE(InfoExtractor):
3800 """Information extractor for youporn.com."""
3801 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
3803 def _print_formats(self, formats):
3804 """Print all available formats"""
3805 print(u'Available formats:')
3806 print(u'ext\t\tformat')
3807 print(u'---------------------------------')
3808 for format in formats:
3809 print(u'%s\t\t%s' % (format['ext'], format['format']))
# return the single format dict matching req_format (loop header elided)
3811 def _specific(self, req_format, formats):
3813 if(x["format"]==req_format):
3817 def _real_extract(self, url):
3818 mobj = re.match(self._VALID_URL, url)
# presumably reached only when mobj is None (guard elided)
3820 self._downloader.report_error(u'invalid URL: %s' % url)
3823 video_id = mobj.group('videoid')
# age gate is bypassed with a cookie rather than a form post
3825 req = compat_urllib_request.Request(url)
3826 req.add_header('Cookie', 'age_verified=1')
3827 webpage = self._download_webpage(req, video_id)
3829 # Get the video title
3830 result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
3832 raise ExtractorError(u'Unable to extract video title')
3833 video_title = result.group('title').strip()
3835 # Get the video date
3836 result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
# date/uploader are optional: warn and continue rather than abort
3838 self._downloader.report_warning(u'unable to extract video date')
3841 upload_date = result.group('date').strip()
3843 # Get the video uploader
3844 result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
3846 self._downloader.report_warning(u'unable to extract uploader')
3847 video_uploader = None
3849 video_uploader = result.group('uploader').strip()
3850 video_uploader = clean_html( video_uploader )
3852 # Get all of the formats available
3853 DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
3854 result = re.search(DOWNLOAD_LIST_RE, webpage)
3856 raise ExtractorError(u'Unable to extract download list')
3857 download_list_html = result.group('download_list').strip()
3859 # Get all of the links from the page
3860 LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
3861 links = re.findall(LINK_RE, download_list_html)
3862 if(len(links) == 0):
3863 raise ExtractorError(u'ERROR: no known formats available for video')
3865 self.to_screen(u'Links found: %d' % len(links))
# per-link loop (header and size/bitrate extraction elided below)
3870 # A link looks like this:
3871 # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
3872 # A path looks like this:
3873 # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
3874 video_url = unescapeHTML( link )
3875 path = compat_urllib_parse_urlparse( video_url ).path
3876 extension = os.path.splitext( path )[1][1:]
# path segment like "480p_370k_..." encodes resolution and bitrate
3877 format = path.split('/')[4].split('_')[:2]
3880 format = "-".join( format )
3881 title = u'%s-%s-%s' % (video_title, size, bitrate)
# per-format dict entries (opener and append elided)
3886 'uploader': video_uploader,
3887 'upload_date': upload_date,
3892 'description': None,
3896 if self._downloader.params.get('listformats', None):
3897 self._print_formats(formats)
# format selection: list is ordered best-first
3900 req_format = self._downloader.params.get('format', None)
3901 self.to_screen(u'Format: %s' % req_format)
3903 if req_format is None or req_format == 'best':
3905 elif req_format == 'worst':
3906 return [formats[-1]]
3907 elif req_format in ('-1', 'all'):
3910 format = self._specific( req_format, formats )
# presumably guarded by "if result is None:" (guard elided)
3912 self._downloader.report_error(u'requested format not available')
# Pornotube extractor: title comes from the URL itself; the flv URL and the
# upload date are regex-scraped from the watch page.
# NOTE(review): lossy sample — "if result is None:" guards and part of the
# info dict / return are elided; lines carry stale embedded numbers.
3918 class PornotubeIE(InfoExtractor):
3919 """Information extractor for pornotube.com."""
3920 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
3922 def _real_extract(self, url):
3923 mobj = re.match(self._VALID_URL, url)
# presumably reached only when mobj is None (guard elided)
3925 self._downloader.report_error(u'invalid URL: %s' % url)
3928 video_id = mobj.group('videoid')
3929 video_title = mobj.group('title')
3931 # Get webpage content
3932 webpage = self._download_webpage(url, video_id)
# video URL is percent-encoded inside a JS "url:" assignment
3935 VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
3936 result = re.search(VIDEO_URL_RE, webpage)
3938 self._downloader.report_error(u'unable to extract video url')
3940 video_url = compat_urllib_parse.unquote(result.group('url'))
3942 #Get the uploaded date
3943 VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
3944 result = re.search(VIDEO_UPLOADED_RE, webpage)
# NOTE(review): error text says 'title' though this scrape is for the date
3946 self._downloader.report_error(u'unable to extract video title')
3948 upload_date = result.group('date')
# info dict entries (remaining keys and return elided)
3950 info = {'id': video_id,
3953 'upload_date': upload_date,
3954 'title': video_title,
# YouJizz extractor: title from the watch page, then follows the embed page and
# pulls the real video URL out of an addVariable("file", ...) call.
# NOTE(review): lossy sample — "if result is None:" guards and parts of the
# final info dict / return are elided; lines carry stale embedded numbers.
3960 class YouJizzIE(InfoExtractor):
3961 """Information extractor for youjizz.com."""
3962 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'
3964 def _real_extract(self, url):
3965 mobj = re.match(self._VALID_URL, url)
# presumably reached only when mobj is None (guard elided)
3967 self._downloader.report_error(u'invalid URL: %s' % url)
3970 video_id = mobj.group('videoid')
3972 # Get webpage content
3973 webpage = self._download_webpage(url, video_id)
3975 # Get the video title
3976 result = re.search(r'<title>(?P<title>.*)</title>', webpage)
3978 raise ExtractorError(u'ERROR: unable to extract video title')
3979 video_title = result.group('title').strip()
3981 # Get the embed page
3982 result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
3984 raise ExtractorError(u'ERROR: unable to extract embed page')
# group(0) is the full matched embed URL; the numeric id replaces video_id
3986 embed_page_url = result.group(0).strip()
3987 video_id = result.group('videoid')
3989 webpage = self._download_webpage(embed_page_url, video_id)
# the flash player receives the file URL via addVariable("file", ...)
3992 result = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
3994 raise ExtractorError(u'ERROR: unable to extract video url')
3995 video_url = result.group('source')
# info dict entries (remaining keys and return elided)
3997 info = {'id': video_id,
3999 'title': video_title,
4002 'player_url': embed_page_url}
# 8tracks extractor: reads the PAGE.mix JSON from the mix page, then walks the
# play/next API with a random session id, collecting one track per iteration
# until the API reports at_last_track.
# NOTE(review): lossy sample — "if not m:", mix_id assignment, res list setup,
# append, break and the final return are elided; lines carry stale numbers.
4006 class EightTracksIE(InfoExtractor):
4008 _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'
4010 def _real_extract(self, url):
4011 mobj = re.match(self._VALID_URL, url)
# presumably guarded by "if mobj is None:" (guard elided)
4013 raise ExtractorError(u'Invalid URL: %s' % url)
4014 playlist_id = mobj.group('id')
4016 webpage = self._download_webpage(url, playlist_id)
4018 m = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
4020 raise ExtractorError(u'Cannot find trax information')
4021 json_like = m.group(1)
4022 data = json.loads(json_like)
# random session id keeps the play API from repeating tracks across runs
4024 session = str(random.randint(0, 1000000000))
4026 track_count = data['tracks_count']
4027 first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
4028 next_url = first_url
# one API call per track; itertools.count() because track_count is advisory
4030 for i in itertools.count():
4031 api_json = self._download_webpage(next_url, playlist_id,
4032 note=u'Downloading song information %s/%s' % (str(i+1), track_count),
4033 errnote=u'Failed to download song information')
4034 api_data = json.loads(api_json)
4035 track_data = api_data[u'set']['track']
# per-track info dict entries (opener and append elided)
4037 'id': track_data['id'],
4038 'url': track_data['track_file_stream_url'],
4039 'title': track_data['performer'] + u' - ' + track_data['name'],
4040 'raw_title': track_data['name'],
4041 'uploader_id': data['user']['login'],
# stop once the API flags the final track (break elided)
4045 if api_data['set']['at_last_track']:
4047 next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
# Keek extractor: video and thumbnail URLs are derived directly from the id on
# cdn.keek.com; title and uploader are scraped from the page.
# NOTE(review): lossy sample — the info dict opener/closer and return are
# elided; lines carry stale embedded numbers.
4050 class KeekIE(InfoExtractor):
4051 _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
4054 def _real_extract(self, url):
4055 m = re.match(self._VALID_URL, url)
4056 video_id = m.group('videoID')
# CDN URLs are templated from the id; no further page parsing needed for them
4057 video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
4058 thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
4059 webpage = self._download_webpage(url, video_id)
4060 m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
4061 title = unescapeHTML(m.group('title'))
4062 m = re.search(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>', webpage)
4063 uploader = clean_html(m.group('uploader'))
# remaining info dict entries (opener/closer elided)
4069 'thumbnail': thumbnail,
4070 'uploader': uploader
# Extractor for ted.com talks and playlists.
# NOTE(review): numbered listing with elided lines (alternation pieces of the
# verbose _VALID_URL, the opening of video_RE, and the _talk_info return dict
# around 4143-4151 are missing); code lines kept verbatim, comments only.
4074 class TEDIE(InfoExtractor):
# Verbose (re.VERBOSE) pattern: distinguishes playlist URLs from single-talk
# URLs via the named groups 'type_playlist' / 'type_talk'.
4075 _VALID_URL=r'''http://www.ted.com/
4077 ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
4079 ((?P<type_talk>talks)) # We have a simple talk
4081 /(?P<name>\w+) # Here goes the name and then ".html"
# Overrides the base-class suitable() because this pattern needs re.VERBOSE.
4085 def suitable(cls, url):
4086 """Receives a URL and returns True if suitable for this IE."""
4087 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
4089 def _real_extract(self, url):
# Dispatch: single talk vs playlist, based on which named group matched.
4090 m=re.match(self._VALID_URL, url, re.VERBOSE)
4091 if m.group('type_talk'):
4092 return [self._talk_info(url)]
4094 playlist_id=m.group('playlist_id')
4095 name=m.group('name')
4096 self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
4097 return [self._playlist_videos_info(url,name,playlist_id)]
4099 def _talk_video_link(self,mediaSlug):
4100 '''Returns the video link for that mediaSlug'''
4101 return 'http://download.ted.com/talks/%s.mp4' % mediaSlug
4103 def _playlist_videos_info(self,url,name,playlist_id=0):
4104 '''Returns the videos of the playlist'''
# The opening of the verbose video_RE raw string (line 4105) is elided here.
4106 <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
4107 ([.\s]*?)data-playlist_item_id="(\d+)"
4108 ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
4110 video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
4111 webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
# Two parallel finditer scans, later zipped: one for ids/slugs, one for names.
4112 m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
4113 m_names=re.finditer(video_name_RE,webpage)
4115 playlist_RE = r'div class="headline">(\s*?)<h1>(\s*?)<span>(?P<playlist_title>.*?)</span>'
4116 m_playlist = re.search(playlist_RE, webpage)
4117 playlist_title = m_playlist.group('playlist_title')
# Each playlist entry is delegated back to this extractor as a url_result.
4119 playlist_entries = []
4120 for m_video, m_name in zip(m_videos,m_names):
4121 video_id=m_video.group('video_id')
4122 talk_url='http://www.ted.com%s' % m_name.group('talk_url')
4123 playlist_entries.append(self.url_result(talk_url, 'TED'))
4124 return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)
4126 def _talk_info(self, url, video_id=0):
4127 """Return the video for the talk in the url"""
4128 m=re.match(self._VALID_URL, url,re.VERBOSE)
4129 videoName=m.group('name')
4130 webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
4131 # If the url includes the language we get the title translated
4132 title_RE=r'<span id="altHeadline" >(?P<title>.*)</span>'
4133 title=re.search(title_RE, webpage).group('title')
# Pull id and mediaSlug out of the embedded talkDetails JS object.
4134 info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
4135 "id":(?P<videoID>[\d]+).*?
4136 "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
4137 thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
4138 thumb_match=re.search(thumb_RE,webpage)
4139 info_match=re.search(info_RE,webpage,re.VERBOSE)
4140 video_id=info_match.group('videoID')
4141 mediaSlug=info_match.group('mediaSlug')
4142 video_url=self._talk_video_link(mediaSlug)
# The rest of the returned info dict (lines 4143-4147 and the closing lines)
# is elided from this listing.
4148 'thumbnail': thumb_match.group('thumbnail')
# Extractor for myspass.de: metadata comes from a site-provided XML endpoint
# rather than from scraping the HTML page.
# NOTE(review): numbered listing with elided lines (guards such as 4162-4165,
# error-path lines 4174/4180/4184-4185, else-branches 4190-4191/4195-4202);
# code lines kept verbatim, comments only.
4152 class MySpassIE(InfoExtractor):
4153 _VALID_URL = r'http://www.myspass.de/.*'
4155 def _real_extract(self, url):
4156 META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'
4158 # video id is the last path element of the URL
4159 # usually there is a trailing slash, so also try the second but last
4160 url_path = compat_urllib_parse_urlparse(url).path
4161 url_parent_path, video_id = os.path.split(url_path)
# Fallback: trailing slash left video_id empty, so take the parent element.
4163 _, video_id = os.path.split(url_parent_path)
# Fetch the metadata XML for this id and parse it.
4166 metadata_url = META_DATA_URL_TEMPLATE % video_id
4167 metadata_text = self._download_webpage(metadata_url, video_id)
4168 metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))
4170 # extract values from metadata
# url_flv is mandatory; its absence is reported as an error.
4171 url_flv_el = metadata.find('url_flv')
4172 if url_flv_el is None:
4173 self._downloader.report_error(u'unable to extract download url')
4175 video_url = url_flv_el.text
# File extension is derived from the download URL's suffix (without the dot).
4176 extension = os.path.splitext(video_url)[1][1:]
4177 title_el = metadata.find('title')
4178 if title_el is None:
4179 self._downloader.report_error(u'unable to extract title')
4181 title = title_el.text
4182 format_id_el = metadata.find('format_id')
4183 if format_id_el is None:
4186 format = format_id_el.text
# description and imagePreview are optional XML elements.
4187 description_el = metadata.find('description')
4188 if description_el is not None:
4189 description = description_el.text
4192 imagePreview_el = metadata.find('imagePreview')
4193 if imagePreview_el is not None:
4194 thumbnail = imagePreview_el.text
# The opening of the returned info dict (lines 4195-4202) is elided here.
4203 'thumbnail': thumbnail,
4204 'description': description
# Extractor for spiegel.de videos: title is scraped from the HTML page, the
# media file is resolved via a per-video XML document on video2.spiegel.de.
# NOTE(review): numbered listing with elided lines (e.g. the "if m is None:"
# guard at 4217 and the info-dict opening 4232-4235); code kept verbatim.
4208 class SpiegelIE(InfoExtractor):
4209 _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'
4211 def _real_extract(self, url):
4212 m = re.match(self._VALID_URL, url)
4213 video_id = m.group('videoID')
4215 webpage = self._download_webpage(url, video_id)
4216 m = re.search(r'<div class="spVideoTitle">(.*?)</div>', webpage)
# Raised when the title div is missing (guard line elided from this listing).
4218 raise ExtractorError(u'Cannot find title')
4219 video_title = unescapeHTML(m.group(1))
# Companion XML document describing the available media renditions.
4221 xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
4222 xml_code = self._download_webpage(xml_url, video_id,
4223 note=u'Downloading XML', errnote=u'Failed to download XML')
4225 idoc = xml.etree.ElementTree.fromstring(xml_code)
# The last child element is taken as the preferred rendition — presumably the
# renditions are ordered by quality; TODO confirm against the XML schema.
4226 last_type = idoc[-1]
4227 filename = last_type.findall('./filename')[0].text
4228 duration = float(last_type.findall('./duration')[0].text)
4230 video_url = 'http://video2.spiegel.de/flash/' + filename
# Extension = text after the last dot of the filename.
4231 video_ext = filename.rpartition('.')[2]
# The opening of the returned info dict (lines 4232-4235) is elided here.
4236 'title': video_title,
4237 'duration': duration,
# Extractor for liveleak.com view pages.
# NOTE(review): numbered listing with elided lines (the "is None" guards and
# else-branches, plus the info-dict opening 4276-4283); code kept verbatim.
# NOTE(review): mixes the deprecated self._downloader.trouble() (4249, 4264)
# with report_error() (4258) — should be unified on report_error.
4241 class LiveLeakIE(InfoExtractor):
4243 _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
4244 IE_NAME = u'liveleak'
4246 def _real_extract(self, url):
4247 mobj = re.match(self._VALID_URL, url)
# Error path for a non-matching URL (its guard line is elided here).
4249 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
4252 video_id = mobj.group('video_id')
4254 webpage = self._download_webpage(url, video_id)
# Media URL is taken from the embedded player config ('file: "..."').
4256 m = re.search(r'file: "(.*?)",', webpage)
4258 self._downloader.report_error(u'unable to find video url')
4260 video_url = m.group(1)
# Title from the Open Graph tag, with the site prefix stripped.
4262 m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
4264 self._downloader.trouble(u'Cannot find video title')
4265 title = unescapeHTML(m.group('title')).replace('LiveLeak.com -', '').strip()
# Description and uploader are optional (their if/else lines are elided).
4267 m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
4269 desc = unescapeHTML(m.group('desc'))
4273 m = re.search(r'By:.*?(\w+)</a>', webpage)
4275 uploader = clean_html(m.group(1))
# The opening of the returned info dict (lines 4276-4283) is elided here.
4284 'description': desc,
4285 'uploader': uploader
# Extractor for the ARD Mediathek / Das Erste Mediathek.
# NOTE(review): numbered listing with elided lines (the if/else keywords
# around 4298-4304, the empty-streams guard 4309, the else at 4325 and the
# final return 4328-4329); code lines kept verbatim, comments only.
4290 class ARDIE(InfoExtractor):
4291 _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
4292 _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
# Captures the mediaCollection.addMediaStream(...) JS calls embedded in the
# page: media type, quality level, RTMP URL and plain video URL.
4293 _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'
4295 def _real_extract(self, url):
4296 # determine video id from url
4297 m = re.match(self._VALID_URL, url)
# Prefer an explicit documentId= query parameter over the path component.
4299 numid = re.search(r'documentId=([0-9]+)', url)
4301 video_id = numid.group(1)
4303 video_id = m.group('video_id')
4305 # determine title and media streams from webpage
4306 html = self._download_webpage(url, video_id)
4307 title = re.search(self._TITLE, html).group('title')
4308 streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
# No streams found: pages age-restricted ("fsk") are only served after 8 pm.
4310 assert '"fsk"' in html
4311 self._downloader.report_error(u'this video is only available after 8:00 pm')
4314 # choose default media type and highest quality for now
4315 stream = max([s for s in streams if int(s["media_type"]) == 0],
4316 key=lambda s: int(s["quality"]))
4318 # there's two possibilities: RTMP stream or HTTP download
4319 info = {'id': video_id, 'title': title, 'ext': 'mp4'}
4320 if stream['rtmp_url']:
4321 self.to_screen(u'RTMP download detected')
# For RTMP, url is the stream endpoint and play_path the mp4: media path.
4322 assert stream['video_url'].startswith('mp4:')
4323 info["url"] = stream["rtmp_url"]
4324 info["play_path"] = stream['video_url']
# HTTP fallback branch (its "else:" line is elided from this listing).
4326 assert stream["video_url"].endswith('.mp4')
4327 info["url"] = stream["video_url"]
# Extractor for Tumblr video posts.
# NOTE(review): numbered listing with elided lines (the "if not video:" guard
# around 4343-4345 and the tail of the return dict, 4358-4364); code kept
# verbatim, comments only.
# NOTE(review): user-facing message "No video founded" (4344) has a typo —
# should read "No video found" (left untouched here; doc-only pass).
4330 class TumblrIE(InfoExtractor):
4331 _VALID_URL = r'http://(?P<blog_name>.*?).tumblr.com/((post)|(video))/(?P<id>\d*)/(.*?)'
4333 def _real_extract(self, url):
4334 m_url = re.match(self._VALID_URL, url)
4335 video_id = m_url.group('id')
4336 blog = m_url.group('blog_name')
# Canonicalize to the /post/<id>/ URL before downloading the page.
4338 url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
4339 webpage = self._download_webpage(url, video_id)
# The player markup is JS-escaped in the page, hence the literal \x22 ('"')
# escapes inside the pattern.
4341 re_video = r'src=\\x22(?P<video_url>http://%s.tumblr.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
4342 video = re.search(re_video, webpage)
4344 self.to_screen("No video founded")
4346 video_url = video.group('video_url')
4347 ext = video.group('ext')
4349 re_thumb = r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22' # We pick the first poster
4350 thumb = re.search(re_thumb, webpage).group('thumb').replace('\\', '')
4352 # The only place where you can get a title, it's not complete,
4353 # but searching in other places doesn't work for all videos
4354 re_title = r'<title>(.*?) - (?P<title>.*?)</title>'
4355 title = unescapeHTML(re.search(re_title, webpage).group('title'))
# The remainder of the returned info dict (lines 4358-4364) is elided here.
4357 return [{'id': video_id,
# Registry factory: one fresh instance of every extractor, in match priority
# order. NOTE(review): the body of this list is elided in this listing —
# only three of the entries (originally lines 4368-4421) are visible.
4365 def gen_extractors():
4366 """ Return a list of an instance of every supported extractor.
4367 The order does matter; the first extractor matched is the one handling the URL.
4370 YoutubePlaylistIE(),
4395 StanfordOpenClassroomIE(),
4405 WorldStarHipHopIE(),
def get_info_extractor(ie_name):
    """Look up the extractor class registered under ``ie_name``.

    Extractor classes follow the ``<Name>IE`` naming convention, so the
    class is resolved by appending 'IE' and reading this module's globals.
    Raises KeyError if no such extractor exists.
    """
    class_name = '%sIE' % ie_name
    return globals()[class_name]