2 # -*- coding: utf-8 -*-
4 from __future__ import absolute_import
15 import xml.etree.ElementTree
23 class InfoExtractor(object):
24 """Information Extractor class.
26 Information extractors are the classes that, given a URL, extract
27 information about the video (or videos) the URL refers to. This
28 information includes the real video URL, the video title, author and
29 others. The information is stored in a dictionary which is then
30 passed to the FileDownloader. The FileDownloader processes this
31 information possibly downloading the video to the file system, among
32 other possible outcomes.
34 The dictionaries must include the following fields:
38 title: Video title, unescaped.
39 ext: Video filename extension.
41 The following fields are optional:
43 format: The video format, defaults to ext (used for --get-format)
44 thumbnail: Full URL to a video thumbnail image.
45 description: One-line video description.
46 uploader: Full name of the video uploader.
47 upload_date: Video upload date (YYYYMMDD).
48 uploader_id: Nickname or id of the video uploader.
49 location: Physical location of the video.
50 player_url: SWF Player URL (used for rtmpdump).
51 subtitles: The subtitle file contents.
52 urlhandle: [internal] The urlHandle to be used to download the file,
53 like returned by urllib.request.urlopen
55 The fields should all be Unicode strings.
57 Subclasses of this one should re-define the _real_initialize() and
58 _real_extract() methods and define a _VALID_URL regexp.
59 Probably, they should also be added to the list of extractors.
61 _real_extract() must return a *list* of information dictionaries as
64 Finally, the _WORKING attribute should be set to False for broken IEs
65 in order to warn the users and skip the tests.
    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader (FileDownloader or None)."""
        # The downloader drives this IE and later receives the extracted info dicts.
        self.set_downloader(downloader)
78 def suitable(cls, url):
79 """Receives a URL and returns True if suitable for this IE."""
80 return re.match(cls._VALID_URL, url) is not None
84 """Getter method for _WORKING."""
88 """Initializes an instance (authentication, etc)."""
90 self._real_initialize()
    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        # Delegates the real work to the subclass-provided _real_extract().
        return self._real_extract(url)
    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        # Stored so helper methods can reach downloader.params and screen output.
        self._downloader = downloader
    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        # Intentionally does nothing in the base class.
    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        # Intentionally does nothing in the base class.
112 return type(self).__name__[:-2]
    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the response handle """
        # note=False suppresses the status line entirely.
            note = u'Downloading video webpage'
        if note is not False:
            self.to_screen(u'%s: %s' % (video_id, note))
            return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                errnote = u'Unable to download webpage'
            # Re-raise network failures as ExtractorError, preserving the traceback.
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])
    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the data of the page as a string """
        urlh = self._request_webpage(url_or_request, video_id, note, errnote)
        # Pick the charset from the Content-Type header when one is declared.
        content_type = urlh.headers.get('Content-Type', '')
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
            encoding = m.group(1)
        webpage_bytes = urlh.read()
        # Optionally dump the raw page (base64-encoded) for debugging.
        if self._downloader.params.get('dump_intermediate_pages', False):
                url = url_or_request.get_full_url()
            except AttributeError:
            self.to_screen(u'Dumping request to ' + url)
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        return webpage_bytes.decode(encoding, 'replace')
    def to_screen(self, msg):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        # IE_NAME is derived from the class name (trailing "IE" stripped).
        self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))
    def report_extraction(self, id_or_name):
        """Report that information extraction has started for *id_or_name*."""
        self.to_screen(u'%s: Extracting information' % id_or_name)
    def report_age_confirmation(self):
        """Report attempt to confirm age (age-gated content)."""
        self.to_screen(u'Confirming age')
    # Methods for following #608
    # They set the correct value of the '_type' key
    def video_result(self, video_info):
        """Returns a video"""
        # Tag the info dict so the downloader treats it as a single video.
        video_info['_type'] = 'video'
    def url_result(self, url, ie=None):
        """Returns a url that points to a page that should be processed"""
        # TODO: ie should be the class used for getting the info
        video_info = {'_type': 'url',
    def playlist_result(self, entries, playlist_id=None, playlist_title=None):
        """Returns a playlist"""
        video_info = {'_type': 'playlist',
            # Attach playlist metadata.
            video_info['id'] = playlist_id
            video_info['title'] = playlist_title
183 class YoutubeIE(InfoExtractor):
184 """Information extractor for youtube.com."""
188 (?:https?://)? # http(s):// (optional)
189 (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
190 tube\.majestyc\.net/) # the various hostnames, with wildcard subdomains
191 (?:.*?\#/)? # handle anchor (#/) redirect urls
192 (?: # the various things that can precede the ID:
193 (?:(?:v|embed|e)/) # v/ or embed/ or e/
194 |(?: # or the v= param in all its forms
195 (?:watch(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
196 (?:\?|\#!?) # the params delimiter ? or # or #!
197 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
200 )? # optional -> youtube.com/xxxx is OK
201 )? # all until now is optional -> you can pass the naked ID
202 ([0-9A-Za-z_-]+) # here is it! the YouTube video ID
203 (?(1).+)? # if we found the ID, everything can follow
    # URL that forces the site into English (hl=en) for stable scraping.
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    # Pulls the original target out of age-gate redirect URLs.
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container extension used when naming the download
    _video_extensions = {
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
    # itag -> dimension string shown by _print_formats
    _video_dimensions = {
244 def suitable(cls, url):
245 """Receives a URL and returns True if suitable for this IE."""
246 if YoutubePlaylistIE.suitable(url): return False
247 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
    def report_lang(self):
        """Report attempt to set the interface language."""
        self.to_screen(u'Setting language')
    def report_login(self):
        """Report attempt to log in to the account."""
        self.to_screen(u'Logging in')
    def report_video_webpage_download(self, video_id):
        """Report attempt to download the video's watch page."""
        self.to_screen(u'%s: Downloading video webpage' % video_id)
    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download the get_video_info page."""
        self.to_screen(u'%s: Downloading video info webpage' % video_id)
    def report_video_subtitles_download(self, video_id):
        """Report that the list of available subtitles is being checked."""
        self.to_screen(u'%s: Checking available subtitles' % video_id)
    def report_video_subtitles_request(self, video_id, sub_lang, format):
        """Report that subtitles for one language/format pair are being downloaded."""
        self.to_screen(u'%s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))
273 def report_video_subtitles_available(self, video_id, sub_lang_list):
274 """Report available subtitles."""
275 sub_lang = ",".join(list(sub_lang_list.keys()))
276 self.to_screen(u'%s: Available subtitles for video: %s' % (video_id, sub_lang))
    def report_information_extraction(self, video_id):
        """Report attempt to extract the video's metadata."""
        self.to_screen(u'%s: Extracting video information' % video_id)
    def report_unavailable_format(self, video_id, format):
        """Report that the requested format is not available for this video."""
        self.to_screen(u'%s: Format %s not available' % (video_id, format))
    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self.to_screen(u'RTMP download detected')
    def _get_available_subtitles(self, video_id):
        """Fetch the subtitle language list; on failure return (error_message, None)."""
        self.report_video_subtitles_download(video_id)
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
            sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'unable to download video subtitles: %s' % compat_str(err), None)
        # Build a lang_code -> human-readable-name mapping from the XML listing.
        sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
        sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
        if not sub_lang_list:
            return (u'video doesn\'t have subtitles', None)
303 def _list_available_subtitles(self, video_id):
304 sub_lang_list = self._get_available_subtitles(video_id)
305 self.report_video_subtitles_available(video_id, sub_lang_list)
    def _request_subtitle(self, sub_lang, sub_name, video_id, format):
        """
        Return a tuple:
        (error_message, sub_lang, sub)
        """
        self.report_video_subtitles_request(video_id, sub_lang, format)
        # Build the timedtext API query for this language/format pair.
        params = compat_urllib_parse.urlencode({
        url = 'http://www.youtube.com/api/timedtext?' + params
            sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
            return (u'Did not fetch video subtitles', None, None)
        return (None, sub_lang, sub)
    def _extract_subtitle(self, video_id):
        """
        Return a list with a tuple:
        [(error_message, sub_lang, sub)]
        """
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
            return [(sub_lang_list[0], None, None)]
        # Language choice: the explicit 'subtitleslang' option wins, otherwise fall back to an available language.
        if self._downloader.params.get('subtitleslang', False):
            sub_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in sub_lang_list:
            sub_lang = list(sub_lang_list.keys())[0]
        if not sub_lang in sub_lang_list:
            return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]
        subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
    def _extract_all_subtitles(self, video_id):
        """Fetch subtitles for every available language, one tuple per language."""
        sub_lang_list = self._get_available_subtitles(video_id)
        sub_format = self._downloader.params.get('subtitlesformat')
        if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
            return [(sub_lang_list[0], None, None)]
        for sub_lang in sub_lang_list:
            subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
            subtitles.append(subtitle)
    def _print_formats(self, formats):
        """List each itag with its extension and dimensions on stdout."""
        print('Available formats:')
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
    def _real_initialize(self):
        """Set UI language, log in (username/password or .netrc) and confirm age."""
        if self._downloader is None:
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))

        # Set language (failure is only a warning, extraction can proceed)
        request = compat_urllib_request.Request(self._LANG_URL)
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))

        # No authentication to be performed
        request = compat_urllib_request.Request(self._LOGIN_URL)
            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))

        # Scrape the GALX/dsh hidden form fields required by the login POST.
        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
            galx = match.group(1)
        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
                u'continue': u'http://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                u'PersistentCookie': u'yes',
                u'bgresponse': u'js_disabled',
                u'checkConnection': u'',
                u'checkedDomains': u'youtube',
                u'signIn': u'Sign in',
                u'service': u'youtube',
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # If the login form is still present in the result, the credentials were rejected.
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username or password')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))

        # Confirm age
                'action_confirm': 'Confirm',
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
            self.report_age_confirmation()
            age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
    def _extract_id(self, url):
        """Pull the YouTube video ID out of *url* using _VALID_URL."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            self._downloader.report_error(u'invalid URL: %s' % url)
        video_id = mobj.group(2)
    def _real_extract(self, url):
        """Extract metadata and per-format URLs for a single YouTube video."""
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
            url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))

        # Try several 'el' values until get_video_info hands back a token.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            video_info_webpage = self._download_webpage(video_info_url, video_id,
                    errnote='unable to download video info webpage')
            video_info = compat_parse_qs(video_info_webpage)
            if 'token' in video_info:
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.report_error(u'YouTube said: %s' % video_info['reason'][0])
                self._downloader.report_error(u'"token" parameter not in video info for unknown reason')

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.report_error(u'"rental" videos not supported')

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.report_error(u'unable to extract uploader name')
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
            video_uploader_id = mobj.group(1)
            self._downloader.report_warning(u'unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            self._downloader.report_error(u'unable to extract video title')
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.report_warning(u'unable to extract video thumbnail')
        else: # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date: scraped from the page, then normalized to YYYYMMDD
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
            fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
                video_description = unescapeHTML(fd_mobj.group(1))
                video_description = u''

        # subtitles
        video_subtitles = None
        if self._downloader.params.get('writesubtitles', False):
            video_subtitles = self._extract_subtitle(video_id)
                (sub_error, sub_lang, sub) = video_subtitles[0]
                    self._downloader.report_error(sub_error)
        if self._downloader.params.get('allsubtitles', False):
            video_subtitles = self._extract_all_subtitles(video_id)
            for video_subtitle in video_subtitles:
                (sub_error, sub_lang, sub) = video_subtitle
                    self._downloader.report_error(sub_error)
        if self._downloader.params.get('listsubtitles', False):
            sub_lang_list = self._list_available_subtitles(video_id)

        # duration
        if 'length_seconds' not in video_info:
            self._downloader.report_warning(u'unable to extract video duration')
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)
        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)
            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                raise ExtractorError(u'no known formats available for video')
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
                # Specific formats. We pick the first in a slash-delimited sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                        video_url_list = [(rf, url_map[rf])]
                if video_url_list is None:
                    raise ExtractorError(u'requested format not available')
            raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')

        # Build one info dict per selected format.
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')
            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))
                'url': video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date': upload_date,
                'title': video_title,
                'ext': video_extension,
                'format': video_format,
                'thumbnail': video_thumbnail,
                'description': video_description,
                'player_url': player_url,
                'subtitles': video_subtitles,
                'duration': video_duration
679 class MetacafeIE(InfoExtractor):
680 """Information Extractor for metacafe.com."""
    # group(1) holds the video id (used by _real_extract).
    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    # Family-filter pages fetched/POSTed during initialization.
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'
687 def __init__(self, downloader=None):
688 InfoExtractor.__init__(self, downloader)
    def report_disclaimer(self):
        """Report disclaimer page retrieval."""
        self.to_screen(u'Retrieving disclaimer')
    def report_download_webpage(self, video_id):
        """Report webpage download for *video_id*."""
        self.to_screen(u'%s: Downloading webpage' % video_id)
    def _real_initialize(self):
        """Acknowledge the disclaimer and POST the family-filter form."""
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
            self.report_disclaimer()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to retrieve disclaimer: %s' % compat_str(err))

        # Confirm age
            'submit': "Continue - I'm over 18",
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
            self.report_age_confirmation()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to confirm age: %s' % compat_str(err))
    def _real_extract(self, url):
        """Extract video URL, title and uploader from a metacafe.com watch page."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.report_error(u'invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1), 'Youtube')]

        # Retrieve video webpage to extract further information
        webpage = self._download_webpage('http://www.metacafe.com/watch/%s/' % video_id, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            gdaKey = mobj.group(1)
            video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
            # Fall back to the flashvars blob when no direct mediaURL is present.
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
                self._downloader.report_error(u'unable to extract media URL')
            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.report_error(u'unable to extract media URL')
            mobj = re.search(r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
                self._downloader.report_error(u'unable to extract media URL')
            mediaURL = mobj.group('mediaURL').replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key'))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
            self._downloader.report_error(u'unable to extract title')
        # NOTE(review): .decode('utf-8') on these values only works on Python 2 byte strings -- verify under Python 3.
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'submitter=(.*?);', webpage)
            self._downloader.report_error(u'unable to extract uploader nickname')
        video_uploader = mobj.group(1)

            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
791 class DailymotionIE(InfoExtractor):
792 """Information Extractor for Dailymotion"""
    # group(1) carries the id plus optional suffixes; _real_extract splits them off.
    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'
798 def __init__(self, downloader=None):
799 InfoExtractor.__init__(self, downloader)
    def _real_extract(self, url):
        """Extract video URL, title, uploader and date from a Dailymotion page."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.report_error(u'invalid URL: %s' % url)

        # Only the leading token is the id; strip title/query suffixes.
        video_id = mobj.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        # Turn off the family filter via cookie.
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'\s*var flashvars = (.*)', webpage)
            self._downloader.report_error(u'unable to extract media URL')
        flashvars = compat_urllib_parse.unquote(mobj.group(1))

        # Probe qualities from best to worst and keep the first one present.
        for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
                self.to_screen(u'Using %s' % key)
            self._downloader.report_error(u'unable to extract video URL')

        mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
            self._downloader.report_error(u'unable to extract video URL')

        video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
            self._downloader.report_error(u'unable to extract title')
        video_title = unescapeHTML(mobj.group('title'))

        video_uploader = None
        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
            # looking for an official user
            mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if mobj_official is None:
                self._downloader.report_warning(u'unable to extract uploader nickname')
                video_uploader = mobj_official.group(1)
            video_uploader = mobj.group(1)

        video_upload_date = None
        # Page shows DD-MM-YYYY; reassemble as YYYYMMDD.
        mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
            video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)

            'uploader': video_uploader,
            'upload_date': video_upload_date,
            'title': video_title,
            'ext': video_extension,
876 class PhotobucketIE(InfoExtractor):
877 """Information extractor for photobucket.com."""
    # group(1) is the .flv filename taken from the 'current' query parameter.
    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'
882 def __init__(self, downloader=None):
883 InfoExtractor.__init__(self, downloader)
    def report_download_webpage(self, video_id):
        """Report webpage download for *video_id*."""
        self.to_screen(u'%s: Downloading webpage' % video_id)
    def _real_extract(self, url):
        """Extract the flv URL, title and uploader from a Photobucket page."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.report_error(u'Invalid URL: %s' % url)

        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
            self._downloader.report_error(u'unable to extract media URL')
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))

        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
            self._downloader.report_error(u'unable to extract title')
        # NOTE(review): .decode('utf-8') here only works on Python 2 byte strings -- verify under Python 3.
        video_title = mobj.group(1).decode('utf-8')

        video_uploader = mobj.group(2).decode('utf-8')

            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader,
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
937 class YahooIE(InfoExtractor):
938 """Information extractor for video.yahoo.com."""
941 # _VALID_URL matches all Yahoo! Video URLs
942 # _VPAGE_URL matches only the extractable '/watch/' URLs
943 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
944 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
945 IE_NAME = u'video.yahoo'
947 def __init__(self, downloader=None):
948 InfoExtractor.__init__(self, downloader)
950 def report_download_webpage(self, video_id):
951 """Report webpage download."""
952 self.to_screen(u'%s: Downloading webpage' % video_id)
    def _real_extract(self, url, new_video=True):
        """Extract downloadable info for a Yahoo Video URL.

        NOTE(review): this listing appears truncated -- guard lines such as
        ``if mobj is None:``, ``try:`` and ``return`` statements are not
        visible, so the extra-indented calls below are shown as they would
        sit under those (missing) guards. Confirm against the full file
        before editing.
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
            # presumably guarded by ``if mobj is None:`` (line not visible)
            self._downloader.report_error(u'Invalid URL: %s' % url)

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = compat_urllib_request.Request(url)
                # presumably inside a ``try:`` (line not visible)
                webpage = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))

            # "id" and "vid" are embedded in the page's JavaScript
            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
                self._downloader.report_error(u'Unable to extract id field')
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
                self._downloader.report_error(u'Unable to extract vid field')
            yahoo_vid = mobj.group(1)

            # Recurse once with the canonical /watch/ URL
            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
            self._downloader.report_error(u'unable to extract video title')
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
            self._downloader.report_error(u'unable to extract video uploader')
        # NOTE(review): group(1) here is the (people|profile) alternation,
        # not the uploader name captured by group(2) -- verify upstream.
        video_uploader = mobj.group(1).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
            self._downloader.report_error(u'unable to extract video thumbnail')
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
            self._downloader.report_error(u'unable to extract video description')
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
            self._downloader.report_error(u'unable to extract video height')
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
            self._downloader.report_error(u'unable to extract video width')
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
        yv_bitrate = '700' # according to Wikipedia this is hard-coded
        request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
            self._downloader.report_error(u'Unable to extract media URL')
        video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        video_url = unescapeHTML(video_url)

        # Partial result dict -- the 'url' key and the surrounding
        # ``return [{ ... }]`` brackets are not visible in this listing.
        'id': video_id.decode('utf-8'),
        'uploader': video_uploader,
        'upload_date': None,
        'title': video_title,
        'ext': video_extension.decode('utf-8'),
        'thumbnail': video_thumbnail.decode('utf-8'),
        'description': video_description,
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs
    # Named groups: 'proto' (scheme, may be absent), 'direct_link' (the
    # play_redirect_hls form) and 'id' (the numeric video id).
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
1083 def __init__(self, downloader=None):
1084 InfoExtractor.__init__(self, downloader)
1086 def report_download_webpage(self, video_id):
1087 """Report webpage download."""
1088 self.to_screen(u'%s: Downloading webpage' % video_id)
    def _real_extract(self, url, new_video=True):
        """Extract video info from a Vimeo page.

        NOTE(review): this listing appears truncated -- ``if mobj is None:``
        guards, ``try:``/``break``/``else:`` lines, ``return`` statements
        and parts of the final result dict are not visible. Extra-indented
        lines below sit under those (missing) guards.
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
            # presumably guarded by ``if mobj is None:`` (line not visible)
            self._downloader.report_error(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        # Normalize: force https and undo the direct-link form
        if not mobj.group('proto'):
            url = 'https://' + url
        if mobj.group('direct_link'):
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
            # presumably inside a ``try:`` (line not visible)
            self.report_download_webpage(video_id)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
            self._downloader.report_error(u'unable to extract info section')

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description: video_description = clean_html(video_description)
        else: video_description = u''

        # Extract upload date
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                    # presumably under an ``else:`` (line not visible)
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self.to_screen(u'%s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
            # presumably the for-``else:`` branch (break/else lines not visible)
            self._downloader.report_error(u'no known codec found')

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    %(video_id, sig, timestamp, video_quality, video_codec.upper())

        # Partial result dict -- the 'id'/'url' keys and surrounding
        # brackets are not visible in this listing.
        'uploader': video_uploader,
        'uploader_id': video_uploader_id,
        'upload_date': video_upload_date,
        'title': video_title,
        'ext': video_extension,
        'thumbnail': video_thumbnail,
        'description': video_description,
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    # Matches the trailing page name of a live-stream URL (see _real_extract)
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'
1200 def __init__(self, downloader=None):
1201 InfoExtractor.__init__(self, downloader)
1203 def report_download_webpage(self, video_id):
1204 """Report webpage download."""
1205 self.to_screen(u'%s: Downloading webpage' % video_id)
    def fetch_webpage(self, url):
        """Download *url* and return the page content.

        NOTE(review): the ``try:`` and ``return`` lines are not visible in
        this listing; the body is shown as it would sit under the missing
        ``try:``.
        """
        request = compat_urllib_request.Request(url)
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
        except ValueError as err:
            self._downloader.report_error(u'Invalid URL: %s' % url)
    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch *url* and pull fields out of it with *regex*.

        ``matchTuples`` is an iterable of (group_index, key, error_message);
        each matched group is stored into an info dict under *key*.
        NOTE(review): the ``if mobj is None:`` guard, the ``info = {}``
        initializer and the ``return``/``else:`` lines are not visible in
        this listing.
        """
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
            # presumably guarded by ``if mobj is None:`` (line not visible)
            self._downloader.report_error(u'Invalid URL: %s' % url)

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                self._downloader.report_error(err)
                # presumably under an ``else:`` (line not visible)
                info[key] = mobj.group(i)
    def extractLiveStream(self, url):
        """Handle the live-stream page variant.

        NOTE(review): lines are missing from this listing (closing
        parentheses / flag arguments of the grep_webpage calls and the final
        result handling); the visible lines are reproduced as-is.
        """
        # Language code is the 4th-from-last path segment on live URLs
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            r'src="(.*?/videothek_js.*?\.js)',
            (1, 'url', u'Invalid URL: %s' % url)
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
            (1, 'path', u'could not extract video path: %s' % url),
            (2, 'player', u'could not extract video player: %s' % url),
            (3, 'url', u'could not extract video url: %s' % url)
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))
    def extractPlus7Stream(self, url):
        """Handle the arte+7 (catch-up) page variant.

        NOTE(review): lines are missing from this listing (closing
        parentheses / flag arguments of the grep_webpage calls and the
        surrounding ``return {`` of the final dict); the visible lines are
        reproduced as-is.
        """
        # Language code is the 3rd-from-last path segment on +7 URLs
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            (1, 'url', u'Invalid URL: %s' % url)
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            (1, 'url', u'Could not find <video> tag: %s' % url)
        next_url = compat_urllib_parse.unquote(info.get('url'))

        info = self.grep_webpage(
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            (1, 'id', u'could not extract video id: %s' % url),
            (2, 'title', u'could not extract video title: %s' % url),
            (3, 'date', u'could not extract video date: %s' % url),
            (4, 'url', u'could not extract video url: %s' % url)

        # Partial result dict (surrounding brackets not visible)
        'id': info.get('id'),
        'url': compat_urllib_parse.unquote(info.get('url')),
        'uploader': u'arte.tv',
        'upload_date': info.get('date'),
        'title': info.get('title').decode('utf-8'),
    def _real_extract(self, url):
        """Dispatch between the live-stream and the +7 page handlers.

        NOTE(review): the branch separators (``return``/``else:``) are not
        visible in this listing.
        """
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        # Live pages end in index-<n>.html (see _LIVE_URL)
        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
            # presumably the non-live branch (``else:`` line not visible)
            info = self.extractPlus7Stream(url)
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor."""

    IE_NAME = u'generic'
1330 def __init__(self, downloader=None):
1331 InfoExtractor.__init__(self, downloader)
1333 def report_download_webpage(self, video_id):
1334 """Report webpage download."""
1335 if not self._downloader.params.get('test', False):
1336 self._downloader.report_warning(u'Falling back on generic information extractor.')
1337 self.to_screen(u'%s: Downloading webpage' % video_id)
1339 def report_following_redirect(self, new_url):
1340 """Report information extraction."""
1341 self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case return the new url.

        NOTE(review): several lines are missing from this listing (the body
        of HeadRequest.get_method -- presumably ``return "HEAD"`` -- plus
        ``return``/``try:`` lines and the redirect-loop check), so the code
        below is not runnable as shown.
        """
        class HeadRequest(compat_urllib_request.Request):
            # Force HTTP HEAD instead of GET
            def get_method(self):

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    # Drop entity headers that no longer apply to a HEAD
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       origin_req_host=req.get_origin_req_host(),
                    # presumably the non-redirect branch (``else:`` not visible)
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                                      origin_req_host=req.get_origin_req_host(),

        # Build a minimal opener chain supporting HEAD with GET fallback
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()
        self.report_following_redirect(new_url)
    def _real_extract(self, url):
        """Last-resort extraction: look for a direct media URL in the page.

        NOTE(review): ``if mobj is None:`` guards, ``try:`` and ``return``
        statements are not visible in this listing; extra-indented calls
        below sit under those (missing) guards.
        """
        new_url = self._test_redirect(url)
        if new_url: return [self.url_result(new_url)]

        video_id = url.split('/')[-1]
            # presumably inside a ``try:`` (line not visible)
            webpage = self._download_webpage(url, video_id)
        except ValueError as err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            self._downloader.report_error(u'Invalid URL: %s' % url)

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
            # Broaden the search a little bit
            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
            # Broaden the search a little bit: JWPlayer JS loader
            mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
            self._downloader.report_error(u'Invalid URL: %s' % url)

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            self._downloader.report_error(u'Invalid URL: %s' % url)

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        # Video Title - Site Name
        # Site Name | Video Title
        # Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
            self._downloader.report_error(u'unable to extract title')
        video_title = mobj.group(1)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
            # NOTE(review): message says 'title' but this path is the
            # uploader/domain extraction -- likely a copy-paste slip.
            self._downloader.report_error(u'unable to extract title')
        video_uploader = mobj.group(1)

        # Partial result dict -- the 'id'/'url' keys and surrounding
        # brackets are not visible in this listing.
        'uploader': video_uploader,
        'upload_date': None,
        'title': video_title,
        'ext': video_extension,
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries."""
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    # gdata API endpoint; %s is the quoted query, %i the 1-based start index
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'
1472 def __init__(self, downloader=None):
1473 InfoExtractor.__init__(self, downloader)
1475 def report_download_page(self, query, pagenum):
1476 """Report attempt to download search page with given number."""
1477 query = query.decode(preferredencoding())
1478 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
    def _real_extract(self, query):
        """Parse the ytsearch prefix and fetch the requested results.

        NOTE(review): guard/branch lines (``if mobj is None:``, ``return``,
        the empty-prefix ``if`` and the ``try:`` around int(prefix)) are not
        visible in this listing.
        """
        mobj = re.match(self._VALID_URL, query)
            # presumably guarded by ``if mobj is None:`` (line not visible)
            self._downloader.report_error(u'invalid search query "%s"' % query)

        prefix, query = query.split(':')
        query = query.encode('utf-8')
        # Empty prefix -> single result; 'all' -> the site maximum
        return self._get_n_results(query, 1)
        elif prefix == 'all':
            self._get_n_results(query, self._max_youtube_results)
            # otherwise the prefix is the requested result count n
            self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
        elif n > self._max_youtube_results:
            self._downloader.report_warning(u'ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
            n = self._max_youtube_results
            return self._get_n_results(query, n)
        except ValueError: # parsing prefix as integer fails
            return self._get_n_results(query, 1)
    def _get_n_results(self, query, n):
        """Get a specified number of results for a query

        NOTE(review): the initializers for ``video_ids``/``limit``/
        ``pagenum`` plus ``try:``/``return`` lines are not visible in this
        listing.
        """
        # 50 results per API page
        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
                # presumably inside a ``try:`` (line not visible)
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download API page: %s' % compat_str(err))
            api_response = json.loads(data)['data']

            if not 'items' in api_response:
                self._downloader.report_error(u'[youtube] No video results')

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            # totalItems bounds the loop via ``limit``
            limit = min(n, api_response['totalItems'])

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids]
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries."""
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    # Captures the docid of each result link
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    # Present while further result pages exist
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'
1549 def __init__(self, downloader=None):
1550 InfoExtractor.__init__(self, downloader)
1552 def report_download_page(self, query, pagenum):
1553 """Report attempt to download playlist page with given number."""
1554 query = query.decode(preferredencoding())
1555 self.to_screen(u'query "%s": Downloading page %s' % (query, pagenum))
    def _real_extract(self, query):
        """Parse the gvsearch prefix and download the requested results.

        NOTE(review): guard/branch lines (``if mobj is None:``, ``return``,
        the empty-prefix ``if`` and the ``try:`` around int(prefix)) are not
        visible in this listing.
        """
        mobj = re.match(self._VALID_URL, query)
            # presumably guarded by ``if mobj is None:`` (line not visible)
            self._downloader.report_error(u'invalid search query "%s"' % query)

        prefix, query = query.split(':')
        query = query.encode('utf-8')
        # Empty prefix -> single result; 'all' -> the site maximum
        self._download_n_results(query, 1)
        elif prefix == 'all':
            self._download_n_results(query, self._max_google_results)
            # otherwise the prefix is the requested result count n
            self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
        elif n > self._max_google_results:
            self._downloader.report_warning(u'gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
            n = self._max_google_results
            self._download_n_results(query, n)
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)
    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query

        NOTE(review): the pagination loop header, the ``video_ids``/
        ``pagenum`` initializers and ``try:``/``return`` lines are not
        visible in this listing.
        """
        self.report_download_page(query, pagenum)
        result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
        request = compat_urllib_request.Request(result_url)
            # presumably inside a ``try:`` (line not visible)
            page = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))

        # Extract video identifiers
        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
            video_id = mobj.group(1)
            if video_id not in video_ids:
                video_ids.append(video_id)
                if len(video_ids) == n:
                    # Specified n videos reached
                    for id in video_ids:
                        self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])

        # No "next page" marker -> flush what we have
        if re.search(self._MORE_PAGES_INDICATOR, page) is None:
            for id in video_ids:
                self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])

        pagenum = pagenum + 1
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""

    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    # Captures the <id>/<id> pair of each result link
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    # Present while further result pages exist
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'
1633 def __init__(self, downloader=None):
1634 InfoExtractor.__init__(self, downloader)
1636 def report_download_page(self, query, pagenum):
1637 """Report attempt to download playlist page with given number."""
1638 query = query.decode(preferredencoding())
1639 self.to_screen(u'query "%s": Downloading page %s' % (query, pagenum))
    def _real_extract(self, query):
        """Parse the yvsearch prefix and download the requested results.

        NOTE(review): guard/branch lines (``if mobj is None:``, ``return``,
        the empty-prefix ``if`` and the ``try:`` around int(prefix)) are not
        visible in this listing.
        """
        mobj = re.match(self._VALID_URL, query)
            # presumably guarded by ``if mobj is None:`` (line not visible)
            self._downloader.report_error(u'invalid search query "%s"' % query)

        prefix, query = query.split(':')
        query = query.encode('utf-8')
        # Empty prefix -> single result; 'all' -> the site maximum
        self._download_n_results(query, 1)
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
            # otherwise the prefix is the requested result count n
            self._downloader.report_error(u'invalid download number %s for query "%s"' % (n, query))
        elif n > self._max_yahoo_results:
            self._downloader.report_warning(u'yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
            n = self._max_yahoo_results
            self._download_n_results(query, n)
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)
    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query

        NOTE(review): the pagination loop header, the ``video_ids``/
        ``pagenum`` initializers and ``try:``/``return`` lines are not
        visible in this listing.
        """
        already_seen = set()
        self.report_download_page(query, pagenum)
        result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
        request = compat_urllib_request.Request(result_url)
            # presumably inside a ``try:`` (line not visible)
            page = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))

        # Extract video identifiers
        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
            video_id = mobj.group(1)
            if video_id not in already_seen:
                video_ids.append(video_id)
                already_seen.add(video_id)
                if len(video_ids) == n:
                    # Specified n videos reached
                    for id in video_ids:
                        self._downloader.download(['http://video.yahoo.com/watch/%s' % id])

        # No "next page" marker -> flush what we have
        if re.search(self._MORE_PAGES_INDICATOR, page) is None:
            for id in video_ids:
                self._downloader.download(['http://video.yahoo.com/watch/%s' % id])

        pagenum = pagenum + 1
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    # Verbose regex (matched with re.VERBOSE in suitable/_real_extract).
    # NOTE(review): several alternation lines and the closing quotes of the
    # raw string are not visible in this listing.
    _VALID_URL = r"""(?:
                     (?:course|view_play_list|my_playlists|artist|playlist|watch)
                     \? (?:.*?&)*? (?:p|a|list)=
                     ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
                     ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
    IE_NAME = u'youtube:playlist'
1729 def __init__(self, downloader=None):
1730 InfoExtractor.__init__(self, downloader)
1733 def suitable(cls, url):
1734 """Receives a URL and returns True if suitable for this IE."""
1735 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
1737 def report_download_page(self, playlist_id, pagenum):
1738 """Report attempt to download playlist page with given number."""
1739 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
    def _real_extract(self, url):
        """Download all videos of a playlist via the gdata API.

        NOTE(review): guard lines, the pagination loop header, the
        ``videos``/``page_num`` initializers and some ``try:``/``return``/
        ``break`` lines are not visible in this listing.
        """
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            # presumably guarded by ``if mobj is None:`` (line not visible)
            self._downloader.report_error(u'invalid url: %s' % url)

        # Download playlist videos from API
        playlist_id = mobj.group(1) or mobj.group(2)

        self.report_download_page(playlist_id, page_num)

        url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
            # presumably inside a ``try:`` (line not visible)
            page = compat_urllib_request.urlopen(url).read().decode('utf8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))

            response = json.loads(page)
        except ValueError as err:
            self._downloader.report_error(u'Invalid JSON in API response: ' + compat_str(err))

        if 'feed' not in response:
            self._downloader.report_error(u'Got a malformed response from YouTube API')
        if 'entry' not in response['feed']:
            # Number of videos is a multiple of self._MAX_RESULTS
            playlist_title = response['feed']['title']['$t']

        # Keep (position, url) pairs so entries can be sorted later
        videos += [ (entry['yt$position']['$t'], entry['content']['src'])
                    for entry in response['feed']['entry']
                    if 'content' in entry ]

        # A short page means this was the last one
        if len(response['feed']['entry']) < self._MAX_RESULTS:
            videos = [v[1] for v in sorted(videos)]

        url_results = [self.url_result(url, 'Youtube') for url in videos]
        return [self.playlist_result(url_results, playlist_id, playlist_title)]
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    # Marker present while more pages can be loaded (also checked in the
    # AJAX response's load_more_widget_html)
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    IE_NAME = u'youtube:channel'
1801 def report_download_page(self, channel_id, pagenum):
1802 """Report attempt to download channel page with given number."""
1803 self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))
    def extract_videos_from_page(self, page):
        """Collect the distinct video ids linked from *page*.

        NOTE(review): the ``ids_in_page = []`` initializer and the final
        ``return ids_in_page`` are not visible in this listing.
        """
        for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))
    def _real_extract(self, url):
        """List all videos of a channel, following AJAX pagination.

        NOTE(review): guard lines, the ``video_ids``/``pagenum``
        initializers and some loop/``break``/``try:`` lines are not visible
        in this listing.
        """
        # Extract channel id
        mobj = re.match(self._VALID_URL, url)
            # presumably guarded by ``if mobj is None:`` (line not visible)
            self._downloader.report_error(u'invalid url: %s' % url)

        # Download channel page
        channel_id = mobj.group(1)

        self.report_download_page(channel_id, pagenum)
        url = self._TEMPLATE_URL % (channel_id, pagenum)
        request = compat_urllib_request.Request(url)
            # presumably inside a ``try:`` (line not visible)
            page = compat_urllib_request.urlopen(request).read().decode('utf8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))

        # Extract video identifiers
        ids_in_page = self.extract_videos_from_page(page)
        video_ids.extend(ids_in_page)

        # Download any subsequent channel pages using the json-based channel_ajax query
        if self._MORE_PAGES_INDICATOR in page:
            pagenum = pagenum + 1

            self.report_download_page(channel_id, pagenum)
            url = self._MORE_PAGES_URL % (pagenum, channel_id)
            request = compat_urllib_request.Request(url)
                page = compat_urllib_request.urlopen(request).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))

            # The AJAX endpoint returns JSON with HTML fragments inside
            page = json.loads(page)

            ids_in_page = self.extract_videos_from_page(page['content_html'])
            video_ids.extend(ids_in_page)

            # Stop when the widget no longer offers a "load more" control
            if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids]
        url_entries = [self.url_result(url, 'Youtube') for url in urls]
        return [self.playlist_result(url_entries, channel_id)]
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    # gdata caps each response at 50 entries
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'
1876 def __init__(self, downloader=None):
1877 InfoExtractor.__init__(self, downloader)
1879 def report_download_page(self, username, start_index):
1880 """Report attempt to download user page."""
1881 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
1882 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
    def _real_extract(self, url):
        """Page through a user's uploads via the gdata API.

        NOTE(review): guard lines, the ``video_ids``/``pagenum``/
        ``ids_in_page`` initializers, the pagination loop header and
        ``try:``/``break`` lines are not visible in this listing.
        """
        # Extract username
        mobj = re.match(self._VALID_URL, url)
            # presumably guarded by ``if mobj is None:`` (line not visible)
            self._downloader.report_error(u'invalid url: %s' % url)

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        start_index = pagenum * self._GDATA_PAGE_SIZE + 1
        self.report_download_page(username, start_index)

        request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

            # presumably inside a ``try:`` (line not visible)
            page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))

        # Extract video identifiers
        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))

        video_ids.extend(ids_in_page)

        # A little optimization - if current page is not
        # "full", ie. does not contain PAGE_SIZE video ids then
        # we can assume that this page is the last one - there
        # are no more ids on further pages - no need to query
        # again.

        if len(ids_in_page) < self._GDATA_PAGE_SIZE:

        urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
        url_results = [self.url_result(url, 'Youtube') for url in urls]
        return [self.playlist_result(url_results, playlist_title = username)]
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    # NOTE(review): a _PAGE_SIZE attribute is referenced by _real_extract
    # but is not visible in this listing.
    IE_NAME = u'blip.tv:user'
1945 def __init__(self, downloader=None):
1946 InfoExtractor.__init__(self, downloader)
1948 def report_download_page(self, username, pagenum):
1949 """Report attempt to download user page."""
1950 self.to_screen(u'user %s: Downloading video ids from page %d' %
1951 (username, pagenum))
    def _real_extract(self, url):
        """Page through a blip.tv user's episodes via the mobile AJAX API.

        NOTE(review): guard lines, the ``video_ids``/``pagenum``/
        ``ids_in_page`` initializers, the pagination loop header and
        ``try:``/``break`` lines are not visible in this listing.
        """
        # Extract username
        mobj = re.match(self._VALID_URL, url)
            # presumably guarded by ``if mobj is None:`` (line not visible)
            self._downloader.report_error(u'invalid url: %s' % url)

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        request = compat_urllib_request.Request(url)

            # presumably inside a ``try:`` (line not visible)
            page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            mobj = re.search(r'data-users-id="([^"]+)"', page)
            page_base = page_base % mobj.group(1)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        self.report_download_page(username, pagenum)
        url = page_base + "&page=" + str(pagenum)
        request = compat_urllib_request.Request( url )
            page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_error(u'unable to download webpage: %s' % str(err))

        # Extract video identifiers
        for mobj in re.finditer(r'href="/([^"]+)"', page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(unescapeHTML(mobj.group(1)))

        video_ids.extend(ids_in_page)

        # A little optimization - if current page is not
        # "full", ie. does not contain PAGE_SIZE video ids then
        # we can assume that this page is the last one - there
        # are no more ids on further pages - no need to query
        # again.

        if len(ids_in_page) < self._PAGE_SIZE:

        urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
        url_entries = [self.url_result(url, 'BlipTV') for url in urls]
        return [self.playlist_result(url_entries, playlist_title = username)]
# Extractor for depositfiles.com file pages: POSTs the 'Free download'
# indication, then scrapes the real fileshare URL and title from the page.
# NOTE(review): line-sampled extraction — leading numbers are original file
# line numbers; some statements (try:, returns, 'if mobj is None:') are
# missing between visible lines. Code left byte-identical; comments only.
2018 class DepositFilesIE(InfoExtractor):
2019 """Information extractor for depositfiles.com"""
2021 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
2023 def report_download_webpage(self, file_id):
2024 """Report webpage download."""
2025 self.to_screen(u'%s: Downloading webpage' % file_id)
2027 def _real_extract(self, url):
2028 file_id = url.split('/')[-1]
2029 # Rebuild url in english locale
2030 url = 'http://depositfiles.com/en/files/' + file_id
2032 # Retrieve file webpage with 'Free download' button pressed
2033 free_download_indication = { 'gateway_result' : '1' }
2034 request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
2036 self.report_download_webpage(file_id)
2037 webpage = compat_urllib_request.urlopen(request).read()
2038 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2039 self._downloader.report_error(u'Unable to retrieve file webpage: %s' % compat_str(err))
2042 # Search for the real file URL
2043 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2044 if (mobj is None) or (mobj.group(1) is None):
2045 # Try to figure out reason of the error.
2046 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2047 if (mobj is not None) and (mobj.group(1) is not None):
2048 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2049 self._downloader.report_error(u'%s' % restriction_message)
2051 self._downloader.report_error(u'unable to extract download URL from: %s' % url)
2054 file_url = mobj.group(1)
2055 file_extension = os.path.splitext(file_url)[1][1:]
2057 # Search for file title
2058 mobj = re.search(r'<b title="(.*?)">', webpage)
# NOTE(review): guard ('if mobj is None:') missing above the next line.
2060 self._downloader.report_error(u'unable to extract title')
# The .decode('utf-8') calls below are Python-2 era (bytes webpage) —
# presumably broken on py3 str; verify against the rest of the file.
2062 file_title = mobj.group(1).decode('utf-8')
2065 'id': file_id.decode('utf-8'),
2066 'url': file_url.decode('utf-8'),
2068 'upload_date': None,
2069 'title': file_title,
2070 'ext': file_extension.decode('utf-8'),
# Extractor for Facebook videos. _real_initialize logs in (credentials from
# downloader params or .netrc); _real_extract parses the swf param JSON blob
# out of the video page for hd_src/sd_src.
# NOTE(review): line-sampled extraction — leading numbers are original file
# line numbers; try:/return/guard lines are missing between visible lines.
# Code left byte-identical; comments only.
2074 class FacebookIE(InfoExtractor):
2075 """Information Extractor for Facebook"""
2077 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2078 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2079 _NETRC_MACHINE = 'facebook'
2080 IE_NAME = u'facebook'
2082 def report_login(self):
2083 """Report attempt to log in."""
2084 self.to_screen(u'Logging in')
2086 def _real_initialize(self):
# No downloader => nothing to read credentials from; body after this guard
# is missing in this extraction.
2087 if self._downloader is None:
2092 downloader_params = self._downloader.params
2094 # Attempt to use provided username and password or .netrc data
2095 if downloader_params.get('username', None) is not None:
2096 useremail = downloader_params['username']
2097 password = downloader_params['password']
2098 elif downloader_params.get('usenetrc', False):
2100 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2101 if info is not None:
# NOTE(review): assignment of useremail/password from `info` missing here.
2105 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2106 except (IOError, netrc.NetrcParseError) as err:
2107 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
2110 if useremail is None:
# NOTE(review): construction of `login_form` is missing above this line.
2119 request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
2122 login_results = compat_urllib_request.urlopen(request).read()
# A login <form> still present in the response means authentication failed.
2123 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
# NOTE(review): "exceded" is a typo ("exceeded") in this user-facing string;
# left untouched here since a doc_update must not alter runtime strings.
2124 self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2126 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2127 self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
2130 def _real_extract(self, url):
2131 mobj = re.match(self._VALID_URL, url)
2133 self._downloader.report_error(u'invalid URL: %s' % url)
2135 video_id = mobj.group('ID')
2137 url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
2138 webpage = self._download_webpage(url, video_id)
# The page embeds a JSON array of [param, value] pairs between these two
# literal JS fragments; dict() of the parsed pairs recovers the swf params.
2140 BEFORE = '{swf.addParam(param[0], param[1]);});\n'
2141 AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
2142 m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
2144 raise ExtractorError(u'Cannot parse data')
2145 data = dict(json.loads(m.group(1)))
2146 params_raw = compat_urllib_parse.unquote(data['params'])
2147 params = json.loads(params_raw)
2148 video_data = params['video_data'][0]
# Prefer HD source, fall back to SD.
2149 video_url = video_data.get('hd_src')
2151 video_url = video_data['sd_src']
2153 raise ExtractorError(u'Cannot find video URL')
2154 video_duration = int(video_data['video_duration'])
2155 thumbnail = video_data['thumbnail_src']
2157 m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
2159 raise ExtractorError(u'Cannot find title in webpage')
2160 video_title = unescapeHTML(m.group(1))
# NOTE(review): the surrounding info-dict literal ('id', 'url', 'ext', and
# the return) is missing in this extraction.
2164 'title': video_title,
2167 'duration': video_duration,
2168 'thumbnail': thumbnail,
# Extractor for single blip.tv videos: resolves /play/ redirects, then asks
# for the JSON skin (iTunes UA) and either direct-downloads or parses 'Post'.
# NOTE(review): line-sampled extraction — leading numbers are original file
# line numbers; try:/guard/dict-literal lines are missing between visible
# lines. Code left byte-identical; comments only.
2173 class BlipTVIE(InfoExtractor):
2174 """Information extractor for blip.tv"""
2176 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
2177 _URL_EXT = r'^.*\.([a-z0-9]+)$'
2178 IE_NAME = u'blip.tv'
2180 def report_direct_download(self, title):
2181 """Report information extraction."""
2182 self.to_screen(u'%s: Direct download detected' % title)
2184 def _real_extract(self, url):
2185 mobj = re.match(self._VALID_URL, url)
2187 self._downloader.report_error(u'invalid URL: %s' % url)
# /play/ URLs redirect; the real file id is in the redirect URL's fragment.
# Recurse once with the canonical http://blip.tv/a/a-<id> form.
2190 urlp = compat_urllib_parse_urlparse(url)
2191 if urlp.path.startswith('/play/'):
2192 request = compat_urllib_request.Request(url)
2193 response = compat_urllib_request.urlopen(request)
2194 redirecturl = response.geturl()
2195 rurlp = compat_urllib_parse_urlparse(redirecturl)
2196 file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
2197 url = 'http://blip.tv/a/a-' + file_id
2198 return self._real_extract(url)
# NOTE(review): computation of `cchar` ('?' vs '&') is missing above.
2205 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2206 request = compat_urllib_request.Request(json_url)
# blip.tv serves different content depending on UA; impersonate iTunes.
2207 request.add_header('User-Agent', 'iTunes/10.6.1')
2208 self.report_extraction(mobj.group(1))
2211 urlh = compat_urllib_request.urlopen(request)
2212 if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
2213 basename = url.split('/')[-1]
2214 title,ext = os.path.splitext(basename)
2215 title = title.decode('UTF-8')
2216 ext = ext.replace('.', '')
2217 self.report_direct_download(title)
# NOTE(review): the direct-download info dict is partially missing here.
2222 'upload_date': None,
2227 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2228 raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
2229 if info is None: # Regular URL
2231 json_code_bytes = urlh.read()
2232 json_code = json_code_bytes.decode('utf-8')
2233 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2234 self._downloader.report_error(u'unable to read video info webpage: %s' % compat_str(err))
2238 json_data = json.loads(json_code)
2239 if 'Post' in json_data:
2240 data = json_data['Post']
# blip.tv datestamps look like '06-09-12 02:42PM'; normalised to YYYYMMDD.
2244 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2245 video_url = data['media']['url']
2246 umobj = re.match(self._URL_EXT, video_url)
2248 raise ValueError('Can not determine filename extension')
2249 ext = umobj.group(1)
2252 'id': data['item_id'],
2254 'uploader': data['display_name'],
2255 'upload_date': upload_date,
2256 'title': data['title'],
2258 'format': data['media']['mimeType'],
2259 'thumbnail': data['thumbnailUrl'],
2260 'description': data['description'],
# The downloader must keep using the iTunes UA for the media request too.
2261 'player_url': data['embedUrl'],
2262 'user_agent': 'iTunes/10.6.1',
2264 except (ValueError,KeyError) as err:
2265 self._downloader.report_error(u'unable to parse video information: %s' % repr(err))
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de.

    Extracts the direct .flv media URL and the page <title> for a single
    myvideo.de watch page and returns a one-element list of info dicts,
    matching the list contract described on InfoExtractor.
    """

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        InfoExtractor.__init__(self, downloader)

    def _real_extract(self, url):
        """Extract info for `url`; report via the downloader on failure.

        NOTE(review): some guard/return lines were missing from the
        extracted source and have been reconstructed from the visible
        structure — confirm against the file history.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # BUG FIX: original read `self._download.report_error` — the
            # attribute is `_downloader` (set by set_downloader and used by
            # every other extractor in this file); `_download` would raise
            # AttributeError instead of reporting the bad URL.
            self._downloader.report_error(u'invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)
        # The image_src thumbnail link carries the media-server base URL;
        # the flv sits beside it under the numeric video id.
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\'',
                 webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract media URL')
            return
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.report_error(u'unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id':          video_id,
            'url':         video_url,
            'uploader':    None,
            'upload_date': None,
            'title':       video_title,
            'ext':         u'flv',
        }]
# Extractor for The Daily Show / Colbert Report: resolves shortnames and
# clip/episode URLs, downloads the MRSS index, then a mediagen config per
# item, and picks an RTMP rendition which it rewrites to an HTTP URL.
# NOTE(review): line-sampled extraction — leading numbers are original file
# line numbers; try:/return/dict/loop lines are missing between visible
# lines. Code left byte-identical; comments only.
2316 class ComedyCentralIE(InfoExtractor):
2317 """Information extractor for The Daily Show and Colbert Report """
2319 # urls can be abbreviations like :thedailyshow or :colbert
2320 # urls for episodes like:
2321 # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
2322 # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
2323 # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
2324 _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
2325 |(https?://)?(www\.)?
2326 (?P<showname>thedailyshow|colbertnation)\.com/
2327 (full-episodes/(?P<episode>.*)|
2329 (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
2330 |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
2333 _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
# NOTE(review): the bodies of these two mapping literals are missing.
2335 _video_extensions = {
2343 _video_dimensions = {
# _VALID_URL is a verbose-mode pattern, so suitable() must pass re.VERBOSE —
# hence this override of the base-class method.
2353 def suitable(cls, url):
2354 """Receives a URL and returns True if suitable for this IE."""
2355 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
2357 def report_config_download(self, episode_id, media_id):
2358 self.to_screen(u'%s: Downloading configuration for %s' % (episode_id, media_id))
2360 def report_index_download(self, episode_id):
2361 self.to_screen(u'%s: Downloading show index' % episode_id)
2363 def _print_formats(self, formats):
2364 print('Available formats:')
2366 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))
2369 def _real_extract(self, url):
2370 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2372 self._downloader.report_error(u'invalid URL: %s' % url)
# Shortname aliases expand to the show's full-episodes landing page.
2375 if mobj.group('shortname'):
2376 if mobj.group('shortname') in ('tds', 'thedailyshow'):
2377 url = u'http://www.thedailyshow.com/full-episodes/'
2379 url = u'http://www.colbertnation.com/full-episodes/'
2380 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2381 assert mobj is not None
2383 if mobj.group('clip'):
2384 if mobj.group('showname') == 'thedailyshow':
2385 epTitle = mobj.group('tdstitle')
2387 epTitle = mobj.group('cntitle')
2390 dlNewest = not mobj.group('episode')
2392 epTitle = mobj.group('showname')
2394 epTitle = mobj.group('episode')
2396 req = compat_urllib_request.Request(url)
2397 self.report_extraction(epTitle)
2399 htmlHandle = compat_urllib_request.urlopen(req)
2400 html = htmlHandle.read()
2401 webpage = html.decode('utf-8')
2402 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2403 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
# Landing pages redirect to the newest episode; re-match the final URL.
2406 url = htmlHandle.geturl()
2407 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2409 self._downloader.report_error(u'Invalid redirected URL: ' + url)
2411 if mobj.group('episode') == '':
2412 self._downloader.report_error(u'Redirected URL is still not specific: ' + url)
2414 epTitle = mobj.group('episode')
2416 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)
2418 if len(mMovieParams) == 0:
2419 # The Colbert Report embeds the information in a without
2420 # a URL prefix; so extract the alternate reference
2421 # and then add the URL prefix manually.
2423 altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
2424 if len(altMovieParams) == 0:
2425 self._downloader.report_error(u'unable to find Flash URL in webpage ' + url)
2428 mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
2430 uri = mMovieParams[0][1]
2431 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
2432 self.report_index_download(epTitle)
2434 indexXml = compat_urllib_request.urlopen(indexUrl).read()
2435 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2436 self._downloader.report_error(u'unable to download episode index: ' + compat_str(err))
# One MRSS <item> per part of the episode; each has its own mediagen config.
2441 idoc = xml.etree.ElementTree.fromstring(indexXml)
2442 itemEls = idoc.findall('.//item')
2443 for partNum,itemEl in enumerate(itemEls):
2444 mediaId = itemEl.findall('./guid')[0].text
2445 shortMediaId = mediaId.split(':')[-1]
2446 showId = mediaId.split(':')[-2].replace('.com', '')
2447 officialTitle = itemEl.findall('./title')[0].text
2448 officialDate = itemEl.findall('./pubDate')[0].text
2450 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2451 compat_urllib_parse.urlencode({'uri': mediaId}))
2452 configReq = compat_urllib_request.Request(configUrl)
2453 self.report_config_download(epTitle, shortMediaId)
2455 configXml = compat_urllib_request.urlopen(configReq).read()
2456 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2457 self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
2460 cdoc = xml.etree.ElementTree.fromstring(configXml)
# NOTE(review): the `turls = []` initialisation is missing above this loop.
2462 for rendition in cdoc.findall('.//rendition'):
2463 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2467 self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
2470 if self._downloader.params.get('listformats', None):
2471 self._print_formats([i[0] for i in turls])
2474 # For now, just pick the highest bitrate
2475 format,rtmp_video_url = turls[-1]
2477 # Get the format arg from the arg stream
2478 req_format = self._downloader.params.get('format', None)
2480 # Select format if we can find one
# NOTE(review): the `for f, v in turls:` loop header and its match test are
# missing above this assignment.
2483 format, rtmp_video_url = f, v
# The rtmp URL path is rewritten onto a known HTTP mirror base.
2486 m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
2488 raise ExtractorError(u'Cannot transform RTMP url')
2489 base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
2490 video_url = base + m.group('finalid')
2492 effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
# NOTE(review): most of the per-part info dict is missing here.
2497 'upload_date': officialDate,
2502 'description': officialTitle,
2504 results.append(info)
# Extractor for escapistmagazine.com: scrapes og: meta tags for description,
# thumbnail and player URL, then fetches the player's JS config (quoted as
# JSON after a quote swap) to find the media URL in its playlist.
# NOTE(review): line-sampled extraction — leading numbers are original file
# line numbers; try:/return/dict lines are missing between visible lines.
# Code left byte-identical; comments only.
2509 class EscapistIE(InfoExtractor):
2510 """Information extractor for The Escapist """
2512 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2513 IE_NAME = u'escapist'
2515 def report_config_download(self, showName):
2516 self.to_screen(u'%s: Downloading configuration' % showName)
2518 def _real_extract(self, url):
2519 mobj = re.match(self._VALID_URL, url)
2521 self._downloader.report_error(u'invalid URL: %s' % url)
2523 showName = mobj.group('showname')
2524 videoId = mobj.group('episode')
2526 self.report_extraction(showName)
# Decode the page using the charset from Content-Type, defaulting to utf-8.
2528 webPage = compat_urllib_request.urlopen(url)
2529 webPageBytes = webPage.read()
2530 m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
2531 webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
2532 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2533 self._downloader.report_error(u'unable to download webpage: ' + compat_str(err))
# NOTE(review): no None-checks on these meta-tag matches — a missing tag
# would raise AttributeError on .group(1); possibly intentional fail-fast.
2536 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2537 description = unescapeHTML(descMatch.group(1))
2538 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2539 imgUrl = unescapeHTML(imgMatch.group(1))
2540 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2541 playerUrl = unescapeHTML(playerUrlMatch.group(1))
2542 configUrlMatch = re.search('config=(.*)$', playerUrl)
2543 configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))
2545 self.report_config_download(showName)
2547 configJSON = compat_urllib_request.urlopen(configUrl)
2548 m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
2549 configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
2550 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2551 self._downloader.report_error(u'unable to download configuration: ' + compat_str(err))
2554 # Technically, it's JavaScript, not JSON
2555 configJSON = configJSON.replace("'", '"')
2558 config = json.loads(configJSON)
2559 except (ValueError,) as err:
2560 self._downloader.report_error(u'Invalid JSON in configuration file: ' + compat_str(err))
2563 playlist = config['playlist']
# Second playlist entry holds the actual media URL (first is presumably an
# intro/ad — TODO confirm).
2564 videoUrl = playlist[1]['url']
# NOTE(review): the surrounding info dict ('id', 'url', 'title', 'ext') and
# return are partially missing below.
2569 'uploader': showName,
2570 'upload_date': None,
2573 'thumbnail': imgUrl,
2574 'description': description,
2575 'player_url': playerUrl,
# Extractor for collegehumor.com: reads the moogaloop metadata XML, then the
# Adobe HDS (f4m) manifest, and assembles a fragment URL from its media node.
# NOTE(review): line-sampled extraction — leading numbers are original file
# line numbers; try:/return/dict lines are missing between visible lines.
# Code left byte-identical; comments only.
2580 class CollegeHumorIE(InfoExtractor):
2581 """Information extractor for collegehumor.com"""
2584 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2585 IE_NAME = u'collegehumor'
2587 def report_manifest(self, video_id):
2588 """Report information extraction."""
2589 self.to_screen(u'%s: Downloading XML manifest' % video_id)
2591 def _real_extract(self, url):
2592 mobj = re.match(self._VALID_URL, url)
2594 self._downloader.report_error(u'invalid URL: %s' % url)
2596 video_id = mobj.group('videoid')
# NOTE(review): the start of the `info` dict literal is missing above.
2601 'upload_date': None,
2604 self.report_extraction(video_id)
2605 xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
2607 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2608 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2609 self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
2612 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2614 videoNode = mdoc.findall('./video')[0]
2615 info['description'] = videoNode.findall('./description')[0].text
2616 info['title'] = videoNode.findall('./caption')[0].text
2617 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
2618 manifest_url = videoNode.findall('./file')[0].text
2620 self._downloader.report_error(u'Invalid metadata XML file')
# hdcore query arg is required for the HDS manifest to be served.
2623 manifest_url += '?hdcore=2.10.3'
2624 self.report_manifest(video_id)
2626 manifestXml = compat_urllib_request.urlopen(manifest_url).read()
2627 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2628 self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
2631 adoc = xml.etree.ElementTree.fromstring(manifestXml)
2633 media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
2634 node_id = media_node.attrib['url']
2635 video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
2636 except IndexError as err:
2637 self._downloader.report_error(u'Invalid manifest file')
# Build the first-fragment URL from the manifest location and media node.
2640 url_pr = compat_urllib_parse_urlparse(manifest_url)
2641 url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
# Extractor for xvideos.com: pulls the percent-encoded flv URL, the <title>,
# and the thumbnail URL out of the watch page.
# NOTE(review): line-sampled extraction — leading numbers are original file
# line numbers; 'if mobj is None:' guards and parts of the info dict are
# missing between visible lines. Code left byte-identical; comments only.
2648 class XVideosIE(InfoExtractor):
2649 """Information extractor for xvideos.com"""
2651 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2652 IE_NAME = u'xvideos'
2654 def _real_extract(self, url):
2655 mobj = re.match(self._VALID_URL, url)
2657 self._downloader.report_error(u'invalid URL: %s' % url)
2659 video_id = mobj.group(1)
2661 webpage = self._download_webpage(url, video_id)
2663 self.report_extraction(video_id)
# The media URL is URL-encoded inside a flv_url= query parameter.
2667 mobj = re.search(r'flv_url=(.+?)&', webpage)
2669 self._downloader.report_error(u'unable to extract video url')
2671 video_url = compat_urllib_parse.unquote(mobj.group(1))
2675 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2677 self._downloader.report_error(u'unable to extract video title')
2679 video_title = mobj.group(1)
2682 # Extract video thumbnail
2683 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
2685 self._downloader.report_error(u'unable to extract video thumbnail')
# group(0): the whole matched URL is the thumbnail, not the captured name.
2687 video_thumbnail = mobj.group(0)
2693 'upload_date': None,
2694 'title': video_title,
2696 'thumbnail': video_thumbnail,
2697 'description': None,
# Extractor for a single soundcloud.com track: resolves the pretty URL to a
# track id via the resolve API, then reads the streams endpoint for the mp3.
# NOTE(review): line-sampled extraction — leading numbers are original file
# line numbers; try:/return/dict lines are missing between visible lines.
# Code left byte-identical; comments only.
2703 class SoundcloudIE(InfoExtractor):
2704 """Information extractor for soundcloud.com
2705 To access the media, the uid of the song and a stream token
2706 must be extracted from the page source and the script must make
2707 a request to media.soundcloud.com/crossdomain.xml. Then
2708 the media can be grabbed by requesting from an url composed
2709 of the stream token and uid
2712 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2713 IE_NAME = u'soundcloud'
2715 def __init__(self, downloader=None):
2716 InfoExtractor.__init__(self, downloader)
2718 def report_resolve(self, video_id):
2719 """Report information extraction."""
2720 self.to_screen(u'%s: Resolving id' % video_id)
2722 def _real_extract(self, url):
2723 mobj = re.match(self._VALID_URL, url)
2725 self._downloader.report_error(u'invalid URL: %s' % url)
2728 # extract uploader (which is in the url)
2729 uploader = mobj.group(1)
2730 # extract simple title (uploader + slug of song title)
2731 slug_title = mobj.group(2)
2732 simple_title = uploader + u'-' + slug_title
2734 self.report_resolve('%s/%s' % (uploader, slug_title))
# resolve.json maps the human-readable URL to the track's API record.
2736 url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
2737 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2738 request = compat_urllib_request.Request(resolv_url)
2740 info_json_bytes = compat_urllib_request.urlopen(request).read()
2741 info_json = info_json_bytes.decode('utf-8')
2742 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2743 self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
2746 info = json.loads(info_json)
2747 video_id = info['id']
2748 self.report_extraction('%s/%s' % (uploader, slug_title))
2750 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2751 request = compat_urllib_request.Request(streams_url)
2753 stream_json_bytes = compat_urllib_request.urlopen(request).read()
2754 stream_json = stream_json_bytes.decode('utf-8')
2755 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2756 self._downloader.report_error(u'unable to download stream definitions: %s' % compat_str(err))
2759 streams = json.loads(stream_json)
# Fixed choice of the 128kbps mp3 HTTP stream.
2760 mediaURL = streams['http_mp3_128_url']
# NOTE(review): info dict opener, 'id'/'url'/'ext' keys and return are
# missing around the lines below.
2765 'uploader': info['user']['username'],
2766 'upload_date': info['created_at'],
2767 'title': info['title'],
2769 'description': info['description'],
# Extractor for soundcloud.com *sets* (playlists): resolves the set, then
# iterates its tracks, fetching each track's stream URL — near-duplicate of
# SoundcloudIE's per-track logic.
# NOTE(review): IE_NAME here is u'soundcloud', identical to SoundcloudIE's —
# presumably it should be u'soundcloud:set'; confirm before changing.
# NOTE(review): line-sampled extraction — leading numbers are original file
# line numbers; try:/return/dict lines are missing between visible lines.
# Code left byte-identical; comments only.
2772 class SoundcloudSetIE(InfoExtractor):
2773 """Information extractor for soundcloud.com sets
2774 To access the media, the uid of the song and a stream token
2775 must be extracted from the page source and the script must make
2776 a request to media.soundcloud.com/crossdomain.xml. Then
2777 the media can be grabbed by requesting from an url composed
2778 of the stream token and uid
2781 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
2782 IE_NAME = u'soundcloud'
2784 def __init__(self, downloader=None):
2785 InfoExtractor.__init__(self, downloader)
2787 def report_resolve(self, video_id):
2788 """Report information extraction."""
2789 self.to_screen(u'%s: Resolving id' % video_id)
2791 def _real_extract(self, url):
2792 mobj = re.match(self._VALID_URL, url)
2794 self._downloader.report_error(u'invalid URL: %s' % url)
2797 # extract uploader (which is in the url)
2798 uploader = mobj.group(1)
2799 # extract simple title (uploader + slug of song title)
2800 slug_title = mobj.group(2)
2801 simple_title = uploader + u'-' + slug_title
2803 self.report_resolve('%s/sets/%s' % (uploader, slug_title))
2805 url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
2806 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2807 request = compat_urllib_request.Request(resolv_url)
2809 info_json_bytes = compat_urllib_request.urlopen(request).read()
2810 info_json = info_json_bytes.decode('utf-8')
2811 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2812 self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
2816 info = json.loads(info_json)
# Resolve API reports per-request errors inside the JSON payload.
2817 if 'errors' in info:
2818 for err in info['errors']:
2819 self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
2822 for track in info['tracks']:
2823 video_id = track['id']
2824 self.report_extraction('%s/sets/%s' % (uploader, slug_title))
2826 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2827 request = compat_urllib_request.Request(streams_url)
2829 stream_json_bytes = compat_urllib_request.urlopen(request).read()
2830 stream_json = stream_json_bytes.decode('utf-8')
2831 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2832 self._downloader.report_error(u'unable to download stream definitions: %s' % compat_str(err))
2835 streams = json.loads(stream_json)
2836 mediaURL = streams['http_mp3_128_url']
# NOTE(review): the per-track dict opener/append and the final return are
# missing around the lines below.
2841 'uploader': track['user']['username'],
2842 'upload_date': track['created_at'],
2843 'title': track['title'],
2845 'description': track['description'],
# Extractor for infoq.com presentations: the real media id is base64-encoded
# in a jsclassref attribute; the result is an rtmpe URL.
# NOTE(review): line-sampled extraction — leading numbers are original file
# line numbers; guards and parts of the info dict are missing between
# visible lines. Code left byte-identical; comments only.
2850 class InfoQIE(InfoExtractor):
2851 """Information extractor for infoq.com"""
2852 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2854 def _real_extract(self, url):
2855 mobj = re.match(self._VALID_URL, url)
2857 self._downloader.report_error(u'invalid URL: %s' % url)
# No separate id in the URL; the full URL doubles as the video_id here.
2860 webpage = self._download_webpage(url, video_id=url)
2861 self.report_extraction(url)
2864 mobj = re.search(r"jsclassref='([^']*)'", webpage)
2866 self._downloader.report_error(u'unable to extract video url')
# jsclassref holds a base64-encoded, percent-encoded media path.
2868 real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
2869 video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id
2872 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2874 self._downloader.report_error(u'unable to extract video title')
2876 video_title = mobj.group(1)
2878 # Extract description
2879 video_description = u'No description available.'
2880 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2881 if mobj is not None:
2882 video_description = mobj.group(1)
# Derive a short id and extension from the media filename.
2884 video_filename = video_url.split('/')[-1]
2885 video_id, extension = video_filename.split('.')
2891 'upload_date': None,
2892 'title': video_title,
2893 'ext': extension, # Extension is always(?) mp4, but seems to be flv
2895 'description': video_description,
# Extractor for mixcloud.com (marked _WORKING = False): reads the cloudcast
# JSON, picks a format/bitrate from 'audio_formats', and probes candidate
# URLs until one answers.
# NOTE(review): line-sampled extraction — leading numbers are original file
# line numbers; try:/return/loop lines are missing between visible lines.
# The .decode('utf-8') calls on str near the end are py2-era. Code left
# byte-identical; comments only.
2900 class MixcloudIE(InfoExtractor):
2901 """Information extractor for www.mixcloud.com"""
2903 _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
2904 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2905 IE_NAME = u'mixcloud'
2907 def __init__(self, downloader=None):
2908 InfoExtractor.__init__(self, downloader)
2910 def report_download_json(self, file_id):
2911 """Report JSON download."""
2912 self.to_screen(u'Downloading json')
2914 def get_urls(self, jsonData, fmt, bitrate='best'):
2915 """Get urls from 'audio_formats' section in json"""
2918 bitrate_list = jsonData[fmt]
2919 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2920 bitrate = max(bitrate_list) # select highest
2922 url_list = jsonData[fmt][bitrate]
# Some entries are a flat url list rather than a bitrate mapping.
2923 except TypeError: # we have no bitrate info.
2924 url_list = jsonData[fmt]
2927 def check_urls(self, url_list):
2928 """Returns 1st active url from list"""
2929 for url in url_list:
2931 compat_urllib_request.urlopen(url)
# Probe failure => try the next candidate URL.
2933 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2938 def _print_formats(self, formats):
2939 print('Available formats:')
2940 for fmt in formats.keys():
2941 for b in formats[fmt]:
2943 ext = formats[fmt][b][0]
2944 print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
2945 except TypeError: # we have no bitrate info
2946 ext = formats[fmt][0]
2947 print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
2950 def _real_extract(self, url):
2951 mobj = re.match(self._VALID_URL, url)
2953 self._downloader.report_error(u'invalid URL: %s' % url)
2955 # extract uploader & filename from url
2956 uploader = mobj.group(1).decode('utf-8')
2957 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2959 # construct API request
2960 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2961 # retrieve .json file with links to files
2962 request = compat_urllib_request.Request(file_url)
2964 self.report_download_json(file_url)
2965 jsonData = compat_urllib_request.urlopen(request).read()
2966 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2967 self._downloader.report_error(u'Unable to retrieve file: %s' % compat_str(err))
2971 json_data = json.loads(jsonData)
2972 player_url = json_data['player_swf_url']
2973 formats = dict(json_data['audio_formats'])
2975 req_format = self._downloader.params.get('format', None)
2978 if self._downloader.params.get('listformats', None):
2979 self._print_formats(formats)
# 'best' (or unset): take the first format whose URL list yields a live URL.
2982 if req_format is None or req_format == 'best':
2983 for format_param in formats.keys():
2984 url_list = self.get_urls(formats, format_param)
2986 file_url = self.check_urls(url_list)
2987 if file_url is not None:
2990 if req_format not in formats:
2991 self._downloader.report_error(u'format is not available')
2994 url_list = self.get_urls(formats, req_format)
2995 file_url = self.check_urls(url_list)
2996 format_param = req_format
2999 'id': file_id.decode('utf-8'),
3000 'url': file_url.decode('utf-8'),
3001 'uploader': uploader.decode('utf-8'),
3002 'upload_date': None,
3003 'title': json_data['name'],
3004 'ext': file_url.split('.')[-1].decode('utf-8'),
3005 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
3006 'thumbnail': json_data['thumbnail_url'],
3007 'description': json_data['description'],
3008 'player_url': player_url.decode('utf-8'),
# NOTE(review): gapped listing — `if mobj is None:` guards, `try:` openers,
# `return results` lines and parts of the info dicts are missing from view.
3011 class StanfordOpenClassroomIE(InfoExtractor):
3012     """Information extractor for Stanford's Open ClassRoom.

    Handles three URL shapes (see _VALID_URL): a specific video
    (course+video groups), a course page (course group only), and the
    site root, recursing via self.extract() for the latter two.
    """
3014     _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
3015     IE_NAME = u'stanfordoc'
3017     def report_download_webpage(self, objid):
3018         """Report information extraction."""
3019         self.to_screen(u'%s: Downloading webpage' % objid)
3021     def _real_extract(self, url):
3022         mobj = re.match(self._VALID_URL, url)
3024             raise ExtractorError(u'Invalid URL: %s' % url)
# Case 1: a specific video — fetch its metadata XML next to the course's
# videos/ folder and read <title> and <videoFile> out of it.
3026         if mobj.group('course') and mobj.group('video'): # A specific video
3027             course = mobj.group('course')
3028             video = mobj.group('video')
3030                 'id': course + '_' + video,
3032                 'upload_date': None,
3035             self.report_extraction(info['id'])
3036             baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
3037             xmlUrl = baseUrl + video + '.xml'
3039                 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
3040             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3041                 self._downloader.report_error(u'unable to download video info XML: %s' % compat_str(err))
3043             mdoc = xml.etree.ElementTree.fromstring(metaXml)
3045                 info['title'] = mdoc.findall('./title')[0].text
3046                 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
3048                 self._downloader.report_error(u'Invalid metadata XML file')
3050             info['ext'] = info['url'].rpartition('.')[2]
# Case 2: a course page — scrape VideoPage links and re-dispatch each one
# through self.extract().
3052         elif mobj.group('course'): # A course page
3053             course = mobj.group('course')
3058                 'upload_date': None,
3061             coursepage = self._download_webpage(url, info['id'],
3062                                         note='Downloading course info page',
3063                                         errnote='Unable to download course info page')
3065             m = re.search('<h1>([^<]+)</h1>', coursepage)
3067                 info['title'] = unescapeHTML(m.group(1))
3069                 info['title'] = info['id']
3071             m = re.search('<description>([^<]+)</description>', coursepage)
3073                 info['description'] = unescapeHTML(m.group(1))
3075             links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
3078                     'type': 'reference',
3079                     'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
3083             for entry in info['list']:
3084                 assert entry['type'] == 'reference'
3085                 results += self.extract(entry['url'])
# Case 3: site root — scrape CoursePage links from the HomePage and
# recurse the same way.
3089                 'id': 'Stanford OpenClassroom',
3092                 'upload_date': None,
3095             self.report_download_webpage(info['id'])
3096             rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
3098                 rootpage = compat_urllib_request.urlopen(rootURL).read()
3099             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3100                 self._downloader.report_error(u'unable to download course info page: ' + compat_str(err))
3103             info['title'] = info['id']
3105             links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
3108                     'type': 'reference',
3109                     'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
3114             for entry in info['list']:
3115                 assert entry['type'] == 'reference'
3116                 results += self.extract(entry['url'])
# NOTE(review): gapped listing — `if mobj is None:` guards, `return`
# statements and the opening of the result dict are missing from view.
3119 class MTVIE(InfoExtractor):
3120     """Information extractor for MTV.com.

    Scrapes mtv_vt / mtv_an / mtvn_uri meta tags plus the default playlist
    id from the page, then fetches mediaGen XML and picks the last
    <rendition> (treated as highest quality).
    """
3122     _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
3125     def _real_extract(self, url):
3126         mobj = re.match(self._VALID_URL, url)
3128             self._downloader.report_error(u'invalid URL: %s' % url)
3130         if not mobj.group('proto'):
3131             url = 'http://' + url
3132         video_id = mobj.group('videoid')
3134         webpage = self._download_webpage(url, video_id)
3136         mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
3138             self._downloader.report_error(u'unable to extract song name')
# .decode('iso-8859-1') assumes byte strings (Python 2) — TODO confirm.
3140         song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3141         mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
3143             self._downloader.report_error(u'unable to extract performer')
3145         performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3146         video_title = performer + ' - ' + song_name
3148         mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
3150             self._downloader.report_error(u'unable to mtvn_uri')
3152         mtvn_uri = mobj.group(1)
3154         mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
3156             self._downloader.report_error(u'unable to extract content id')
3158         content_id = mobj.group(1)
3160         videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
3161         self.report_extraction(video_id)
3162         request = compat_urllib_request.Request(videogen_url)
3164             metadataXml = compat_urllib_request.urlopen(request).read()
3165         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3166             self._downloader.report_error(u'unable to download video metadata: %s' % compat_str(err))
3169         mdoc = xml.etree.ElementTree.fromstring(metadataXml)
3170         renditions = mdoc.findall('.//rendition')
3172         # For now, always pick the highest quality.
3173         rendition = renditions[-1]
# Derive ext from the MIME type's subtype and compose a format label like
# "mp4-640x360_800".
3176             _,_,ext = rendition.attrib['type'].partition('/')
3177             format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
3178             video_url = rendition.find('./src').text
3180             self._downloader.report_error('Invalid rendition field.')
3186             'uploader': performer,
3187             'upload_date': None,
3188             'title': video_title,
# NOTE(review): gapped listing — the `_gen_sid` def line, several guards,
# `try:` openers, format-selection branches and `return files_info` are
# missing from view.
3196 class YoukuIE(InfoExtractor):
# Extractor for v.youku.com. Fetches the getPlayList JSON, de-scrambles
# the segment file id with a seeded shuffle, and emits one info dict per
# video segment.
3197     _VALID_URL =  r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
3199     def report_download_webpage(self, file_id):
3200         """Report webpage download."""
3201         self.to_screen(u'%s: Downloading webpage' % file_id)
# Session id: current epoch millis + two bounded random ints, concatenated.
3204         nowTime = int(time.time() * 1000)
3205         random1 = random.randint(1000,1998)
3206         random2 = random.randint(1000,9999)
3208         return "%d%d%d" %(nowTime,random1,random2)
3210     def _get_file_ID_mix_string(self, seed):
# Deterministic LCG-driven shuffle of the source alphabet; same seed
# always yields the same mixed string (the de-scrambling key).
3212         source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
3214         for i in range(len(source)):
3215             seed  =  (seed * 211 + 30031 ) % 65536
3216             index  =  math.floor(seed / 65536 * len(source) )
3217             mixed.append(source[int(index)])
3218             source.remove(source[int(index)])
3219         #return ''.join(mixed)
3222     def _get_file_id(self, fileId, seed):
# Map each '*'-separated index in fileId through the mixed alphabet to
# recover the real file id.
3223         mixed = self._get_file_ID_mix_string(seed)
3224         ids = fileId.split('*')
3228             realId.append(mixed[int(ch)])
3229         return ''.join(realId)
3231     def _real_extract(self, url):
3232         mobj = re.match(self._VALID_URL, url)
3234             self._downloader.report_error(u'invalid URL: %s' % url)
3236         video_id = mobj.group('ID')
3238         info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
3240         request = compat_urllib_request.Request(info_url, None, std_headers)
3242             self.report_download_webpage(video_id)
3243             jsondata = compat_urllib_request.urlopen(request).read()
3244         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3245             self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
3248         self.report_extraction(video_id)
3250             jsonstr = jsondata.decode('utf-8')
3251             config = json.loads(jsonstr)
3253             video_title =  config['data'][0]['title']
3254             seed =  config['data'][0]['seed']
3256             format = self._downloader.params.get('format', None)
3257             supported_format = list(config['data'][0]['streamfileids'].keys())
# Format choice: 'best' prefers hd2 when advertised; 'worst' branch and
# the default are partly out of view.
3259             if format is None or format == 'best':
3260                 if 'hd2' in supported_format:
3265             elif format == 'worst':
3273             fileid = config['data'][0]['streamfileids'][format]
3274             keys = [s['k'] for s in config['data'][0]['segs'][format]]
3275         except (UnicodeDecodeError, ValueError, KeyError):
3276             self._downloader.report_error(u'unable to extract info section')
3280         sid = self._gen_sid()
3281         fileid = self._get_file_id(fileid, seed)
3283         #column 8,9 of fileid represent the segment number
3284         #fileid[7:9] should be changed
3285         for index, key in enumerate(keys):
3287             temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
3288             download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
3291                 'id': '%s_part%02d' % (video_id, index),
3292                 'url': download_url,
3294                 'upload_date': None,
3295                 'title': video_title,
3298             files_info.append(info)
# NOTE(review): gapped listing — guards, `try:`, `return` and the opening
# of the result dict are missing from view.
3303 class XNXXIE(InfoExtractor):
3304     """Information extractor for xnxx.com.

    Pulls flv_url, page title and big-thumbnail URL out of the watch page
    with the three class-level regexes below.
    """
3306     _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
3308     VIDEO_URL_RE = r'flv_url=(.*?)&'
3309     VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
3310     VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'
3312     def report_webpage(self, video_id):
3313         """Report information extraction"""
3314         self.to_screen(u'%s: Downloading webpage' % video_id)
3316     def _real_extract(self, url):
3317         mobj = re.match(self._VALID_URL, url)
3319             self._downloader.report_error(u'invalid URL: %s' % url)
3321         video_id = mobj.group(1)
3323         self.report_webpage(video_id)
3325         # Get webpage content
3327             webpage_bytes = compat_urllib_request.urlopen(url).read()
3328             webpage = webpage_bytes.decode('utf-8')
3329         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3330             self._downloader.report_error(u'unable to download video webpage: %s' % err)
# flv_url is percent-encoded in the page, hence the unquote.
3333         result = re.search(self.VIDEO_URL_RE, webpage)
3335             self._downloader.report_error(u'unable to extract video url')
3337         video_url = compat_urllib_parse.unquote(result.group(1))
3339         result = re.search(self.VIDEO_TITLE_RE, webpage)
3341             self._downloader.report_error(u'unable to extract video title')
3343         video_title = result.group(1)
3345         result = re.search(self.VIDEO_THUMB_RE, webpage)
3347             self._downloader.report_error(u'unable to extract video thumbnail')
3349         video_thumbnail = result.group(1)
3355             'upload_date': None,
3356             'title': video_title,
3358             'thumbnail': video_thumbnail,
3359             'description': None,
# NOTE(review): gapped listing — guards, `try:` openers, several `else`
# branches and `return` lines are missing from view.
3363 class GooglePlusIE(InfoExtractor):
3364     """Information extractor for plus.google.com.

    Two-step scrape: the post page yields date/uploader/title and the
    photos sub-page URL; that page yields size-tagged googlevideo links,
    of which the highest resolution is taken.
    """
3366     _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
3367     IE_NAME = u'plus.google'
3369     def __init__(self, downloader=None):
3370         InfoExtractor.__init__(self, downloader)
3372     def report_extract_entry(self, url):
3373         """Report downloading extry"""
3374         self.to_screen(u'Downloading entry: %s' % url)
3376     def report_date(self, upload_date):
3377         """Report downloading extry"""
3378         self.to_screen(u'Entry date: %s' % upload_date)
3380     def report_uploader(self, uploader):
3381         """Report downloading extry"""
3382         self.to_screen(u'Uploader: %s' % uploader)
3384     def report_title(self, video_title):
3385         """Report downloading extry"""
3386         self.to_screen(u'Title: %s' % video_title)
3388     def report_extract_vid_page(self, video_page):
3389         """Report information extraction."""
3390         self.to_screen(u'Extracting video page: %s' % video_page)
3392     def _real_extract(self, url):
3393         # Extract id from URL
3394         mobj = re.match(self._VALID_URL, url)
3396             self._downloader.report_error(u'Invalid URL: %s' % url)
3399         post_url = mobj.group(0)
3400         video_id = mobj.group(1)
3402         video_extension = 'flv'
3404         # Step 1, Retrieve post webpage to extract further information
3405         self.report_extract_entry(post_url)
3406         request = compat_urllib_request.Request(post_url)
3408             webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3409         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3410             self._downloader.report_error(u'Unable to retrieve entry webpage: %s' % compat_str(err))
3413         # Extract update date
3415         pattern = 'title="Timestamp">(.*?)</a>'
3416         mobj = re.search(pattern, webpage)
3418             upload_date = mobj.group(1)
3419             # Convert timestring to a format suitable for filename
3420             upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3421             upload_date = upload_date.strftime('%Y%m%d')
3422         self.report_date(upload_date)
# Extract uploader from the rel="author" anchor.
3426         pattern = r'rel\="author".*?>(.*?)</a>'
3427         mobj = re.search(pattern, webpage)
3429             uploader = mobj.group(1)
3430         self.report_uploader(uploader)
3433         # Get the first line for title
3435         pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3436         mobj = re.search(pattern, webpage)
3438             video_title = mobj.group(1)
3439         self.report_title(video_title)
3441         # Step 2, Stimulate clicking the image box to launch video
3442         pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3443         mobj = re.search(pattern, webpage)
3445             self._downloader.report_error(u'unable to extract video page URL')
3447         video_page = mobj.group(1)
3448         request = compat_urllib_request.Request(video_page)
3450             webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3451         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3452             self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
3454         self.report_extract_vid_page(video_page)
3457         # Extract video links on video page
3458         """Extract video links of all sizes"""
3459         pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3460         mobj = re.findall(pattern, webpage)
3462             self._downloader.report_error(u'unable to extract video links')
3464         # Sort in resolution
3465         links = sorted(mobj)
3467         # Choose the lowest of the sort, i.e. highest resolution
3468         video_url = links[-1]
3469         # Only get the url. The resolution part in the tuple has no use anymore
3470         video_url = video_url[-1]
3471         # Treat escaped \u0026 style hex
3473             video_url = video_url.decode("unicode_escape")
3474         except AttributeError: # Python 3
3475             video_url = bytes(video_url, 'ascii').decode('unicode-escape')
3481             'uploader': uploader,
3482             'upload_date': upload_date,
3483             'title': video_title,
3484             'ext': video_extension,
# NOTE(review): gapped listing — the guard after re.match, _findProp's
# miss-branch and the rest of the returned info dict are missing from view.
3487 class NBAIE(InfoExtractor):
# Extractor for nba.com video pages: the CDN URL is built directly from
# the path id; metadata comes from og:/description tags via _findProp.
3488     _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
3491     def _real_extract(self, url):
3492         mobj = re.match(self._VALID_URL, url)
3494             self._downloader.report_error(u'invalid URL: %s' % url)
3497         video_id = mobj.group(1)
3498         if video_id.endswith('/index.html'):
3499             video_id = video_id[:-len('/index.html')]
3501         webpage = self._download_webpage(url, video_id)
3503         video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
# Closure over `webpage`: first regex group, HTML-unescaped, or `default`.
3504         def _findProp(rexp, default=None):
3505             m = re.search(rexp, webpage)
3507                 return unescapeHTML(m.group(1))
3511         shortened_video_id = video_id.rpartition('/')[2]
3512         title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
3514             'id': shortened_video_id,
3518             'uploader_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
3519             'description': _findProp(r'<div class="description">(.*?)</h1>'),
# NOTE(review): gapped listing — `try:` openers, guards, the paged-loop
# setup (`paged`, `offset`, `while True:`) and `return` lines are missing
# from view.
3523 class JustinTVIE(InfoExtractor):
3524     """Information extractor for justin.tv and twitch.tv"""
3525     # TODO: One broadcast may be split into multiple videos. The key
3526     # 'broadcast_id' is the same for all parts, and 'broadcast_part'
3527     # starts at 1 and increases. Can we treat all parts as one video?
3529     _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
3530         ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
3531     _JUSTIN_PAGE_LIMIT = 100
3532     IE_NAME = u'justin.tv'
3534     def report_download_page(self, channel, offset):
3535         """Report attempt to download a single page of videos."""
3536         self.to_screen(u'%s: Downloading video information from %d to %d' %
3537                 (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
3539     # Return count of items, list of *valid* items
3540     def _parse_page(self, url):
3542             urlh = compat_urllib_request.urlopen(url)
3543             webpage_bytes = urlh.read()
3544             webpage = webpage_bytes.decode('utf-8', 'ignore')
3545         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3546             self._downloader.report_error(u'unable to download video info JSON: %s' % compat_str(err))
# The API returns a JSON list on success; anything else carries an error.
3549         response = json.loads(webpage)
3550         if type(response) != list:
3551             error_text = response.get('error', 'unknown error')
3552             self._downloader.report_error(u'Justin.tv API: %s' % error_text)
3555         for clip in response:
3556             video_url = clip['video_file_url']
3558                 video_extension = os.path.splitext(video_url)[1][1:]
# start_time is ISO-ish; strip dashes from its date part for YYYYMMDD.
3559                 video_date = re.sub('-', '', clip['start_time'][:10])
3560                 video_uploader_id = clip.get('user_id', clip.get('channel_id'))
3561                 video_id = clip['id']
3562                 video_title = clip.get('title', video_id)
3566                     'title': video_title,
3567                     'uploader': clip.get('channel_name', video_uploader_id),
3568                     'uploader_id': video_uploader_id,
3569                     'upload_date': video_date,
3570                     'ext': video_extension,
3572         return (len(response), info)
3574     def _real_extract(self, url):
3575         mobj = re.match(self._VALID_URL, url)
3577             self._downloader.report_error(u'invalid URL: %s' % url)
# Group 1 = channel (archives endpoint); group 2 = single broadcast id.
3580         api = 'http://api.justin.tv'
3581         video_id = mobj.group(mobj.lastindex)
3583         if mobj.lastindex == 1:
3585             api += '/channel/archives/%s.json'
3587             api += '/broadcast/by_archive/%s.json'
3588         api = api % (video_id,)
3590         self.report_extraction(video_id)
3594         limit = self._JUSTIN_PAGE_LIMIT
3597             self.report_download_page(video_id, offset)
3598             page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
3599             page_count, page_info = self._parse_page(page_url)
3600             info.extend(page_info)
# A short page means the last page has been reached.
3601             if not paged or page_count != limit:
# NOTE(review): gapped listing — guards, the desc-miss branch and the
# returned info dict's remaining fields are missing from view.
3606 class FunnyOrDieIE(InfoExtractor):
# Extractor for funnyordie.com: video URL from the <video>/<source>
# markup, title from the player h1 with a <title> fallback, description
# from og:description.
3607     _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
3609     def _real_extract(self, url):
3610         mobj = re.match(self._VALID_URL, url)
3612             self._downloader.report_error(u'invalid URL: %s' % url)
3615         video_id = mobj.group('id')
3616         webpage = self._download_webpage(url, video_id)
3618         m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
3620             self._downloader.report_error(u'unable to find video information')
3621         video_url = unescapeHTML(m.group('url'))
3623         m = re.search(r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", webpage, flags=re.DOTALL)
3625             m = re.search(r'<title>(?P<title>[^<]+?)</title>', webpage)
3627                 self._downloader.report_error(u'Cannot find video title')
3628         title = clean_html(m.group('title'))
3630         m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
3632             desc = unescapeHTML(m.group('desc'))
3641             'description': desc,
# NOTE(review): gapped listing — parts of _VALID_URL (the gameID group),
# the videos-list append and the flv-miss branch are missing from view.
3645 class SteamIE(InfoExtractor):
# Extractor for store.steampowered.com: bypasses the age gate via the
# agecheck URL, then zips movie JS entries with title/thumbnail spans into
# a playlist of videos.
3646     _VALID_URL = r"""http://store.steampowered.com/
3647                 (?P<urltype>video|app)/ #If the page is only for videos or for a game
3649                 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
# Overrides the base suitable() because _VALID_URL needs re.VERBOSE.
3653     def suitable(cls, url):
3654         """Receives a URL and returns True if suitable for this IE."""
3655         return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
3657     def _real_extract(self, url):
3658         m = re.match(self._VALID_URL, url, re.VERBOSE)
3659         gameID = m.group('gameID')
3660         videourl = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970' % gameID
3661         self.report_age_confirmation()
3662         webpage = self._download_webpage(videourl, gameID)
3663         game_title = re.search(r'<h2 class="pageheader">(?P<game_title>.*?)</h2>', webpage).group('game_title')
3665         urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
3666         mweb = re.finditer(urlRE, webpage)
3667         namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
3668         titles = re.finditer(namesRE, webpage)
3669         thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
3670         thumbs = re.finditer(thumbsRE, webpage)
# Relies on the three finditer streams being in the same page order —
# TODO confirm this holds for every store page layout.
3672         for vid,vtitle,thumb in zip(mweb,titles,thumbs):
3673             video_id = vid.group('videoID')
3674             title = vtitle.group('videoName')
3675             video_url = vid.group('videoURL')
3676             video_thumb = thumb.group('thumbnail')
3678                 self._downloader.report_error(u'Cannot find video url for %s' % video_id)
3683                 'title': unescapeHTML(title),
3684                 'thumbnail': video_thumb
3687         return [self.playlist_result(videos, gameID, game_title)]
# NOTE(review): gapped listing — the opening of the returned info dict and
# the `return` line are missing from view.
3689 class UstreamIE(InfoExtractor):
# Extractor for ustream.tv recorded videos: the CDN URL is derived
# directly from the numeric video id; title/uploader are scraped from
# data- attributes.
3690     _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
3691     IE_NAME = u'ustream'
3693     def _real_extract(self, url):
3694         m = re.match(self._VALID_URL, url)
3695         video_id = m.group('videoID')
3696         video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
3697         webpage = self._download_webpage(url, video_id)
# NOTE(review): both re.search results are used unchecked; a layout
# change would raise AttributeError here.
3698         m = re.search(r'data-title="(?P<title>.+)"',webpage)
3699         title = m.group('title')
3700         m = re.search(r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',webpage)
3701         uploader = m.group('uploader')
3707             'uploader': uploader
# NOTE(review): gapped listing — `try:` openers, ext assignment branches
# and the opening/return of the result dict are missing from view.
3711 class WorldStarHipHopIE(InfoExtractor):
# Extractor for worldstarhiphop.com (and the "candy" mirror): grabs the
# first hw-videos mp4/flv URL from the page source and scrapes
# title/thumbnail, with a candy-specific title fallback.
3712     _VALID_URL = r'http://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
3713     IE_NAME = u'WorldStarHipHop'
3715     def _real_extract(self, url):
3716         _src_url = r"""(http://hw-videos.*(?:mp4|flv))"""
3718             webpage_src = compat_urllib_request.urlopen(url).read()
3719             webpage_src = webpage_src.decode('utf-8')
3721         mobj = re.search(_src_url, webpage_src)
3723         m = re.match(self._VALID_URL, url)
3724         video_id = m.group('id')
3726         if mobj is not None:
3727             video_url = mobj.group()
3728             if 'mp4' in video_url:
3733             self._downloader.report_error(u'Cannot find video url for %s' % video_id)
3736         _title = r"""<title>(.*)</title>"""
3738         mobj = re.search(_title, webpage_src)
3740         if mobj is not None:
3741             title = mobj.group(1)
# Fallback title embeds the current ctime (sic: "World Start").
3743             title = 'World Start Hip Hop - %s' % time.ctime()
3745         _thumbnail = r"""rel="image_src" href="(.*)" />"""
3746         mobj = re.search(_thumbnail, webpage_src)
3748         # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
3749         if mobj is not None:
3750             thumbnail = mobj.group(1)
3752             _title = r"""candytitles.*>(.*)</span>"""
3753             mobj = re.search(_title, webpage_src)
3754             if mobj is not None:
3755                 title = mobj.group(1)
3762             'thumbnail' : thumbnail,
# NOTE(review): gapped listing — the `if m is None:` guard, `try:` opener
# and the opening/return of the result dict are missing from view.
3767 class RBMARadioIE(InfoExtractor):
# Extractor for rbmaradio.com shows: metadata comes from the inline
# `gon.show=` JSON blob; the stream is the akamai_url with a 256 kbit/s
# cbr parameter appended.
3768     _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'
3770     def _real_extract(self, url):
3771         m = re.match(self._VALID_URL, url)
3772         video_id = m.group('videoID')
3774         webpage = self._download_webpage(url, video_id)
3775         m = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
3777             raise ExtractorError(u'Cannot find metadata')
3778         json_data = m.group(1)
3781             data = json.loads(json_data)
3782         except ValueError as e:
3783             raise ExtractorError(u'Invalid JSON: ' + str(e))
3785         video_url = data['akamai_url'] + '&cbr=256'
3786         url_parts = compat_urllib_parse_urlparse(video_url)
3787         video_ext = url_parts.path.rpartition('.')[2]
# Optional fields all use dict.get so missing JSON keys degrade to None.
3792             'title': data['title'],
3793             'description': data.get('teaser_text'),
3794             'location': data.get('country_of_origin'),
3795             'uploader': data.get('host', {}).get('name'),
3796             'uploader_id': data.get('host', {}).get('slug'),
3797             'thumbnail': data.get('image', {}).get('large_url_2x'),
3798             'duration': data.get('duration'),
# NOTE(review): gapped listing — guards, the per-link loop header,
# size/bitrate extraction, the format dict opening and several `return`
# lines are missing from view.
3803 class YouPornIE(InfoExtractor):
3804     """Information extractor for youporn.com.

    Sets an age_verified cookie, scrapes title/date/uploader, then parses
    the downloadList <ul> into one format entry per link and applies the
    requested format selection (best/worst/all/specific).
    """
3805     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
3807     def _print_formats(self, formats):
3808         """Print all available formats"""
3809         print(u'Available formats:')
3810         print(u'ext\t\tformat')
3811         print(u'---------------------------------')
3812         for format in formats:
3813             print(u'%s\t\t%s' % (format['ext'], format['format']))
# Return the single entry whose 'format' equals req_format (linear scan).
3815     def _specific(self, req_format, formats):
3817             if(x["format"]==req_format):
3821     def _real_extract(self, url):
3822         mobj = re.match(self._VALID_URL, url)
3824             self._downloader.report_error(u'invalid URL: %s' % url)
3827         video_id = mobj.group('videoid')
# Age gate is bypassed with a cookie rather than a form post.
3829         req = compat_urllib_request.Request(url)
3830         req.add_header('Cookie', 'age_verified=1')
3831         webpage = self._download_webpage(req, video_id)
3833         # Get the video title
3834         result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
3836             raise ExtractorError(u'Unable to extract video title')
3837         video_title = result.group('title').strip()
3839         # Get the video date
3840         result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
3842             self._downloader.report_warning(u'unable to extract video date')
3845             upload_date = result.group('date').strip()
3847         # Get the video uploader
3848         result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
3850             self._downloader.report_warning(u'unable to extract uploader')
3851             video_uploader = None
3853             video_uploader = result.group('uploader').strip()
3854             video_uploader = clean_html( video_uploader )
3856         # Get all of the formats available
3857         DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
3858         result = re.search(DOWNLOAD_LIST_RE, webpage)
3860             raise ExtractorError(u'Unable to extract download list')
3861         download_list_html = result.group('download_list').strip()
3863         # Get all of the links from the page
3864         LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
3865         links = re.findall(LINK_RE, download_list_html)
3866         if(len(links) == 0):
3867             raise ExtractorError(u'ERROR: no known formats available for video')
3869         self.to_screen(u'Links found: %d' % len(links))
3874             # A link looks like this:
3875             # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
3876             # A path looks like this:
3877             # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
3878             video_url = unescapeHTML( link )
3879             path = compat_urllib_parse_urlparse( video_url ).path
3880             extension = os.path.splitext( path )[1][1:]
# Path segment 4 encodes "<size>p_<bitrate>k_<id>"; first two pieces form
# the format label.
3881             format = path.split('/')[4].split('_')[:2]
3884             format = "-".join( format )
3885             title = u'%s-%s-%s' % (video_title, size, bitrate)
3890                 'uploader': video_uploader,
3891                 'upload_date': upload_date,
3896                 'description': None,
3900         if self._downloader.params.get('listformats', None):
3901             self._print_formats(formats)
3904         req_format = self._downloader.params.get('format', None)
3905         self.to_screen(u'Format: %s' % req_format)
# Selection: default/'best' and 'worst' index the (presumably quality-
# ordered) list; '-1'/'all' returns everything; else exact match.
3907         if req_format is None or req_format == 'best':
3909         elif req_format == 'worst':
3910             return [formats[-1]]
3911         elif req_format in ('-1', 'all'):
3914             format = self._specific( req_format, formats )
3916                 self._downloader.report_error(u'requested format not available')
# NOTE(review): gapped listing — guards and parts of the returned info
# dict are missing from view.
3922 class PornotubeIE(InfoExtractor):
3923     """Information extractor for pornotube.com.

    The title comes straight from the URL path; the flv URL and upload
    date are scraped from the watch page.
    """
3924     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
3926     def _real_extract(self, url):
3927         mobj = re.match(self._VALID_URL, url)
3929             self._downloader.report_error(u'invalid URL: %s' % url)
3932         video_id = mobj.group('videoid')
3933         video_title = mobj.group('title')
3935         # Get webpage content
3936         webpage = self._download_webpage(url, video_id)
3939         VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
3940         result = re.search(VIDEO_URL_RE, webpage)
3942             self._downloader.report_error(u'unable to extract video url')
3944         video_url = compat_urllib_parse.unquote(result.group('url'))
3946         #Get the uploaded date
3947         VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
# NOTE(review): error message says "title" but this step extracts the date.
3948         result = re.search(VIDEO_UPLOADED_RE, webpage)
3950             self._downloader.report_error(u'unable to extract video title')
3952         upload_date = result.group('date')
3954         info = {'id': video_id,
3957                 'upload_date': upload_date,
3958                 'title': video_title,
# NOTE(review): gapped listing — guards and parts of the info dict /
# return are missing from view.
3964 class YouJizzIE(InfoExtractor):
3965     """Information extractor for youjizz.com.

    Two-page scrape: the watch page yields the title and the embed-page
    URL; the embed page yields the actual video URL from the player's
    addVariable("file", ...) call.
    """
3966     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'
3968     def _real_extract(self, url):
3969         mobj = re.match(self._VALID_URL, url)
3971             self._downloader.report_error(u'invalid URL: %s' % url)
3974         video_id = mobj.group('videoid')
3976         # Get webpage content
3977         webpage = self._download_webpage(url, video_id)
3979         # Get the video title
3980         result = re.search(r'<title>(?P<title>.*)</title>', webpage)
3982             raise ExtractorError(u'ERROR: unable to extract video title')
3983         video_title = result.group('title').strip()
3985         # Get the embed page
3986         result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
3988             raise ExtractorError(u'ERROR: unable to extract embed page')
# video_id is re-bound here to the embed page's numeric id.
3990         embed_page_url = result.group(0).strip()
3991         video_id = result.group('videoid')
3993         webpage = self._download_webpage(embed_page_url, video_id)
3996         result = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
3998             raise ExtractorError(u'ERROR: unable to extract video url')
3999         video_url = result.group('source')
4001         info = {'id': video_id,
4003                 'title': video_title,
4006                 'player_url': embed_page_url}
# NOTE(review): gapped listing — the `if mobj is None:` guard, `mix_id`
# assignment, the per-track dict opening, `res.append(...)` and the final
# `return res` are missing from view.
4010 class EightTracksIE(InfoExtractor):
# Extractor for 8tracks.com mixes: reads the PAGE.mix JSON from the page,
# then walks the play/next API with a random session id, emitting one
# entry per track until at_last_track.
4012     _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'
4014     def _real_extract(self, url):
4015         mobj = re.match(self._VALID_URL, url)
4017             raise ExtractorError(u'Invalid URL: %s' % url)
4018         playlist_id = mobj.group('id')
4020         webpage = self._download_webpage(url, playlist_id)
4022         m = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
4024             raise ExtractorError(u'Cannot find trax information')
4025         json_like = m.group(1)
4026         data = json.loads(json_like)
# Random session id scopes the play/next API sequence.
4028         session = str(random.randint(0, 1000000000))
4030         track_count = data['tracks_count']
4031         first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
4032         next_url = first_url
4034         for i in itertools.count():
4035             api_json = self._download_webpage(next_url, playlist_id,
4036                 note=u'Downloading song information %s/%s' % (str(i+1), track_count),
4037                 errnote=u'Failed to download song information')
4038             api_data = json.loads(api_json)
4039             track_data = api_data[u'set']['track']
4041                 'id': track_data['id'],
4042                 'url': track_data['track_file_stream_url'],
4043                 'title': track_data['performer'] + u' - ' + track_data['name'],
4044                 'raw_title': track_data['name'],
4045                 'uploader_id': data['user']['login'],
4049             if api_data['set']['at_last_track']:
4051             next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
# NOTE(review): gapped listing — IE_NAME, the info dict opening and the
# `return` line are missing from view.
4054 class KeekIE(InfoExtractor):
# Extractor for keek.com: video and thumbnail URLs are derived directly
# from the id; title/uploader come from og:title and the user-bio block.
4055     _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
4058     def _real_extract(self, url):
4059         m = re.match(self._VALID_URL, url)
4060         video_id = m.group('videoID')
4061         video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
4062         thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
4063         webpage = self._download_webpage(url, video_id)
# NOTE(review): both searches are used unchecked; a page-layout change
# would raise AttributeError.
4064         m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
4065         title = unescapeHTML(m.group('title'))
4066         m = re.search(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>', webpage)
4067         uploader = clean_html(m.group('uploader'))
4073             'thumbnail': thumbnail,
4074             'uploader': uploader
4078 class TEDIE(InfoExtractor):
4079 _VALID_URL=r'''http://www.ted.com/
4081 ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
4083 ((?P<type_talk>talks)) # We have a simple talk
4085 /(?P<name>\w+) # Here goes the name and then ".html"
# Overrides InfoExtractor.suitable because TEDIE._VALID_URL is written
# with re.VERBOSE (the base class matches without that flag).
4089     def suitable(cls, url):
4090         """Receives a URL and returns True if suitable for this IE."""
4091         return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
# Dispatch on URL type: a single talk goes to _talk_info, otherwise the
# playlist groups are used.
# NOTE(review): gapped listing — the `else:` between the two branches is
# missing from view.
4093     def _real_extract(self, url):
4094         m=re.match(self._VALID_URL, url, re.VERBOSE)
4095         if m.group('type_talk'):
4096             return [self._talk_info(url)]
4098             playlist_id=m.group('playlist_id')
4099             name=m.group('name')
4100             self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
4101             return [self._playlist_videos_info(url,name,playlist_id)]
# Build the direct download URL for a talk from its media slug.
4103     def _talk_video_link(self,mediaSlug):
4104         '''Returns the video link for that mediaSlug'''
4105         return 'http://download.ted.com/talks/%s.mp4' % mediaSlug
4107 def _playlist_videos_info(self,url,name,playlist_id=0):
4108 '''Returns the videos of the playlist'''
4110 <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
4111 ([.\s]*?)data-playlist_item_id="(\d+)"
4112 ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
4114 video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
4115 webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
4116 m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
4117 m_names=re.finditer(video_name_RE,webpage)
4119 playlist_RE = r'div class="headline">(\s*?)<h1>(\s*?)<span>(?P<playlist_title>.*?)</span>'
4120 m_playlist = re.search(playlist_RE, webpage)
4121 playlist_title = m_playlist.group('playlist_title')
4123 playlist_entries = []
4124 for m_video, m_name in zip(m_videos,m_names):
4125 video_id=m_video.group('video_id')
4126 talk_url='http://www.ted.com%s' % m_name.group('talk_url')
4127 playlist_entries.append(self.url_result(talk_url, 'TED'))
4128 return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)
4130 def _talk_info(self, url, video_id=0):
4131 """Return the video for the talk in the url"""
4132 m=re.match(self._VALID_URL, url,re.VERBOSE)
4133 videoName=m.group('name')
4134 webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
4135 # If the url includes the language we get the title translated
4136 title_RE=r'<span id="altHeadline" >(?P<title>.*)</span>'
4137 title=re.search(title_RE, webpage).group('title')
4138 info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
4139 "id":(?P<videoID>[\d]+).*?
4140 "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
4141 thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
4142 thumb_match=re.search(thumb_RE,webpage)
4143 info_match=re.search(info_RE,webpage,re.VERBOSE)
4144 video_id=info_match.group('videoID')
4145 mediaSlug=info_match.group('mediaSlug')
4146 video_url=self._talk_video_link(mediaSlug)
4152 'thumbnail': thumb_match.group('thumbnail')
4156 class MySpassIE(InfoExtractor):
4157 _VALID_URL = r'http://www.myspass.de/.*'
4159 def _real_extract(self, url):
4160 META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'
4162 # video id is the last path element of the URL
4163 # usually there is a trailing slash, so also try the second but last
4164 url_path = compat_urllib_parse_urlparse(url).path
4165 url_parent_path, video_id = os.path.split(url_path)
4167 _, video_id = os.path.split(url_parent_path)
4170 metadata_url = META_DATA_URL_TEMPLATE % video_id
4171 metadata_text = self._download_webpage(metadata_url, video_id)
4172 metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))
4174 # extract values from metadata
4175 url_flv_el = metadata.find('url_flv')
4176 if url_flv_el is None:
4177 self._downloader.report_error(u'unable to extract download url')
4179 video_url = url_flv_el.text
4180 extension = os.path.splitext(video_url)[1][1:]
4181 title_el = metadata.find('title')
4182 if title_el is None:
4183 self._downloader.report_error(u'unable to extract title')
4185 title = title_el.text
4186 format_id_el = metadata.find('format_id')
4187 if format_id_el is None:
4190 format = format_id_el.text
4191 description_el = metadata.find('description')
4192 if description_el is not None:
4193 description = description_el.text
4196 imagePreview_el = metadata.find('imagePreview')
4197 if imagePreview_el is not None:
4198 thumbnail = imagePreview_el.text
4207 'thumbnail': thumbnail,
4208 'description': description
4212 class SpiegelIE(InfoExtractor):
4213 _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'
4215 def _real_extract(self, url):
4216 m = re.match(self._VALID_URL, url)
4217 video_id = m.group('videoID')
4219 webpage = self._download_webpage(url, video_id)
4220 m = re.search(r'<div class="spVideoTitle">(.*?)</div>', webpage)
4222 raise ExtractorError(u'Cannot find title')
4223 video_title = unescapeHTML(m.group(1))
4225 xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
4226 xml_code = self._download_webpage(xml_url, video_id,
4227 note=u'Downloading XML', errnote=u'Failed to download XML')
4229 idoc = xml.etree.ElementTree.fromstring(xml_code)
4230 last_type = idoc[-1]
4231 filename = last_type.findall('./filename')[0].text
4232 duration = float(last_type.findall('./duration')[0].text)
4234 video_url = 'http://video2.spiegel.de/flash/' + filename
4235 video_ext = filename.rpartition('.')[2]
4240 'title': video_title,
4241 'duration': duration,
4245 class LiveLeakIE(InfoExtractor):
4247 _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
4248 IE_NAME = u'liveleak'
4250 def _real_extract(self, url):
4251 mobj = re.match(self._VALID_URL, url)
4253 self._downloader.report_error(u'invalid URL: %s' % url)
4256 video_id = mobj.group('video_id')
4258 webpage = self._download_webpage(url, video_id)
4260 m = re.search(r'file: "(.*?)",', webpage)
4262 self._downloader.report_error(u'unable to find video url')
4264 video_url = m.group(1)
4266 m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
4268 self._downloader.report_error(u'Cannot find video title')
4269 title = unescapeHTML(m.group('title')).replace('LiveLeak.com -', '').strip()
4271 m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
4273 desc = unescapeHTML(m.group('desc'))
4277 m = re.search(r'By:.*?(\w+)</a>', webpage)
4279 uploader = clean_html(m.group(1))
4288 'description': desc,
4289 'uploader': uploader
4294 class ARDIE(InfoExtractor):
4295 _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
4296 _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
4297 _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'
4299 def _real_extract(self, url):
4300 # determine video id from url
4301 m = re.match(self._VALID_URL, url)
4303 numid = re.search(r'documentId=([0-9]+)', url)
4305 video_id = numid.group(1)
4307 video_id = m.group('video_id')
4309 # determine title and media streams from webpage
4310 html = self._download_webpage(url, video_id)
4311 title = re.search(self._TITLE, html).group('title')
4312 streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
4314 assert '"fsk"' in html
4315 self._downloader.report_error(u'this video is only available after 8:00 pm')
4318 # choose default media type and highest quality for now
4319 stream = max([s for s in streams if int(s["media_type"]) == 0],
4320 key=lambda s: int(s["quality"]))
4322 # there's two possibilities: RTMP stream or HTTP download
4323 info = {'id': video_id, 'title': title, 'ext': 'mp4'}
4324 if stream['rtmp_url']:
4325 self.to_screen(u'RTMP download detected')
4326 assert stream['video_url'].startswith('mp4:')
4327 info["url"] = stream["rtmp_url"]
4328 info["play_path"] = stream['video_url']
4330 assert stream["video_url"].endswith('.mp4')
4331 info["url"] = stream["video_url"]
4334 class TumblrIE(InfoExtractor):
4335 _VALID_URL = r'http://(?P<blog_name>.*?).tumblr.com/((post)|(video))/(?P<id>\d*)/(.*?)'
4337 def _real_extract(self, url):
4338 m_url = re.match(self._VALID_URL, url)
4339 video_id = m_url.group('id')
4340 blog = m_url.group('blog_name')
4342 url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
4343 webpage = self._download_webpage(url, video_id)
4345 re_video = r'src=\\x22(?P<video_url>http://%s.tumblr.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
4346 video = re.search(re_video, webpage)
4348 self.to_screen("No video founded")
4350 video_url = video.group('video_url')
4351 ext = video.group('ext')
4353 re_thumb = r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22' # We pick the first poster
4354 thumb = re.search(re_thumb, webpage).group('thumb').replace('\\', '')
4356 # The only place where you can get a title, it's not complete,
4357 # but searching in other places doesn't work for all videos
4358 re_title = r'<title>(.*?) - (?P<title>.*?)</title>'
4359 title = unescapeHTML(re.search(re_title, webpage).group('title'))
4361 return [{'id': video_id,
4369 def gen_extractors():
4370 """ Return a list of an instance of every supported extractor.
4371 The order does matter; the first extractor matched is the one handling the URL.
4374 YoutubePlaylistIE(),
4399 StanfordOpenClassroomIE(),
4409 WorldStarHipHopIE(),
4426 def get_info_extractor(ie_name):
4427 """Returns the info extractor class with the given ie_name"""
4428 return globals()[ie_name+'IE']