2 # -*- coding: utf-8 -*-
4 from __future__ import absolute_import
13 import xml.etree.ElementTree
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    title:          Video title, unescaped.
    ext:            Video filename extension.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader_id:    Nickname or id of the video uploader.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The .srt file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    The fields should all be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self.set_downloader(downloader)

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(self._VALID_URL, url) is not None

        # NOTE(review): the two docstrings below belong to methods whose
        # `def` lines are not visible in this excerpt.
        """Getter method for _WORKING."""

        """Initializes an instance (authentication, etc)."""
        self._real_initialize()

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""

        # Derive the IE name by dropping the trailing "IE" from the class
        # name (e.g. YoutubeIE -> Youtube).
        return type(self).__name__[:-2]

    def _download_webpage(self, url, video_id, note=None, errnote=None):
        """Download *url* and return the page contents decoded as UTF-8.

        *note*/*errnote* customise the progress and error messages; sane
        defaults are used when they are None.
        """
        note = u'Downloading video webpage'
        self._downloader.to_screen(u'[%s] %s: %s' % (self.IE_NAME, video_id, note))
        urlh = compat_urllib_request.urlopen(url)
        webpage_bytes = urlh.read()
        # 'replace' keeps extraction going even on mis-encoded pages.
        return webpage_bytes.decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            errnote = u'Unable to download webpage'
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)))
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # Verbose-mode pattern matching every URL shape that designates a single
    # video (the assignment line is outside this excerpt).
        (?:https?://)?                                       # http(s):// (optional)
        (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
           tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
        (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
        (?!view_play_list|my_playlists|artist|playlist)      # ignore playlist URLs
        (?:                                                  # the various things that can precede the ID:
            (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
            |(?:                                             # or the v= param in all its forms
                (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
            )?                                               # optional -> youtube.com/xxxx is OK
        )?                                                   # all until now is optional -> you can pass the naked ID
        ([0-9A-Za-z_-]+)                                     # here is it! the YouTube video ID
        (?(1).+)?                                            # if we found the ID, everything can follow

    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # Maps itag -> container extension.
    _video_extensions = {
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
    # Maps itag -> "WIDTHxHEIGHT" display string (entries elided in this excerpt).
    _video_dimensions = {

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL is written in verbose mode, so re.VERBOSE must be passed.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to download video subtitles."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report that the requested format is not available."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _closed_captions_xml_to_srt(self, xml_string):
        """Convert YouTube's closed-captions XML into SubRip (.srt) text."""
        texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
        # TODO parse xml instead of regex
        for n, (start, dur_tag, dur, caption) in enumerate(texts):
            if not dur: dur = '4'  # default duration when the tag has none
            end = start + float(dur)
            # Render SRT timestamps: HH:MM:SS,mmm
            start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
            end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
            caption = unescapeHTML(caption)
            caption = unescapeHTML(caption) # double cycle, intentional
            srt += str(n+1) + '\n'
            srt += start + ' --> ' + end + '\n'
            srt += caption + '\n\n'

    def _extract_subtitles(self, video_id):
        """Fetch subtitles for *video_id*.

        Returns a (warning, srt_contents) pair where exactly one element
        is None: warnings are strings, success carries the .srt text.
        """
        self.report_video_subtitles_download(video_id)
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
            srt_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
        # Build a mapping lang_code -> track name from the listing.
        srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
        srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
        if not srt_lang_list:
            return (u'WARNING: video has no closed captions', None)
        if self._downloader.params.get('subtitleslang', False):
            srt_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in srt_lang_list:
            # Fallback: first available language.
            srt_lang = list(srt_lang_list.keys())[0]
        if not srt_lang in srt_lang_list:
            return (u'WARNING: no closed captions found in the specified language', None)
        request = compat_urllib_request.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id))
            srt_xml = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
            return (u'WARNING: unable to download video subtitles', None)
        return (None, self._closed_captions_xml_to_srt(srt_xml))

    def _print_formats(self, formats):
        """Print each itag with its extension and dimensions."""
        print('Available formats:')
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        """Set the site language, log in (explicit credentials or .netrc)
        and confirm age before any extraction takes place."""
        if self._downloader is None:

        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                # .netrc problems are non-fatal: continue unauthenticated.
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))

        # Set language
        request = compat_urllib_request.Request(self._LANG_URL)
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % compat_str(err))

        # No authentication to be performed
            'current_form': 'loginForm',
            'action_login': 'Log In',
            'username': username,
            'password': password,
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # The login form reappearing in the response means login failed.
            if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))

        # Confirm age
            'action_confirm': 'Confirm',
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
            self.report_age_confirmation()
            age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))

    def _extract_id(self, url):
        """Return the video id captured by _VALID_URL, reporting bad URLs."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group(2)

    def _real_extract(self, url):
        """Download webpage + video info and return the info dictionaries."""
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
            url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
            # Un-escape the JSON-escaped URL.
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))

        # Get video info: try several 'el' values until one yields a token.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = compat_urllib_request.Request(video_info_url)
                video_info_webpage_bytes = compat_urllib_request.urlopen(request).read()
                video_info_webpage = video_info_webpage_bytes.decode('utf-8', 'ignore')
                video_info = compat_parse_qs(video_info_webpage)
                if 'token' in video_info:
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))

        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0])
                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.trouble(u'ERROR: "rental" videos not supported')

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader name')
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/user/([^"]+)">', video_webpage)
            video_uploader_id = mobj.group(1)
            self._downloader.trouble(u'WARNING: unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
        else: # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
            # Normalise separators to spaces, then try several date layouts.
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
            video_description = ''

        # closed captions
        video_subtitles = None
        if self._downloader.params.get('writesubtitles', False):
            (srt_error, video_subtitles) = self._extract_subtitles(video_id)
                self._downloader.trouble(srt_error)

        if 'length_seconds' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video duration')
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
            # itag -> signed download URL
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
                # Specific formats. We pick the first in a slash-delimited sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                        video_url_list = [(rf, url_map[rf])]
                if video_url_list is None:
                    self._downloader.trouble(u'ERROR: requested format not available')
            self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')

        # Build one info dict per selected format.
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

                'url': video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date': upload_date,
                'title': video_title,
                'ext': video_extension,
                'format': video_format,
                'thumbnail': video_thumbnail,
                'description': video_description,
                'player_url': player_url,
                'subtitles': video_subtitles,
                'duration': video_duration
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        """Fetch the disclaimer page and answer the family-filter form so
        age-restricted videos become accessible."""
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
            self.report_disclaimer()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err))

        # Confirm age by posting the filter form.
            'submit': "Continue - I'm over 18",
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
            self.report_age_confirmation()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))

    def _real_extract(self, url):
        """Extract media URL, uploader and title for a metacafe watch URL."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            # Delegate 'yt-' prefixed ids to the YouTube extractor.
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
            # Fallback: parse the flashvars blob for the media URL and key.
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
                self._downloader.trouble(u'ERROR: unable to extract media URL')
            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
            mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
                self._downloader.trouble(u'ERROR: unable to extract media URL')
            mediaURL = mobj.group(1).replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'submitter=(.*?);', webpage)
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = mobj.group(1)

            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract video URL (best available quality), title, uploader and
        upload date from a Dailymotion page."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # Strip any '_title' suffix and query string from the captured id.
        video_id = mobj.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        # Disable the family filter so age-restricted pages are served.
        request.add_header('Cookie', 'family_filter=off')
            self.report_download_webpage(video_id)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'\s*var flashvars = (.*)', webpage)
            self._downloader.trouble(u'ERROR: unable to extract media URL')
        flashvars = compat_urllib_parse.unquote(mobj.group(1))

        # Probe the quality keys from best to worst.
        for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
                self._downloader.to_screen(u'[dailymotion] Using %s' % key)
            self._downloader.trouble(u'ERROR: unable to extract video URL')

        mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
            self._downloader.trouble(u'ERROR: unable to extract video URL')

        video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = unescapeHTML(mobj.group('title'))

        video_uploader = None
        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
            # looking for official user
            mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if mobj_official is None:
                self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
                video_uploader = mobj_official.group(1)
            video_uploader = mobj.group(1)

        video_upload_date = None
        mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
            # Page shows DD-MM-YYYY; reassemble as YYYYMMDD.
            video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)

            'uploader': video_uploader,
            'upload_date': video_upload_date,
            'title': video_title,
            'ext': video_extension,
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the .flv media URL, title and uploader from the page."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract media URL')
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))

        # Title and uploader come from the same <title> pattern.
        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        video_uploader = mobj.group(2).decode('utf-8')

            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader,
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract media URL and metadata; non-/watch/ URLs are first
        rewritten to their canonical /watch/ form and re-extracted
        (recursion bounded by *new_video*)."""
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = compat_urllib_request.Request(url)
                webpage = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
                self._downloader.trouble(u'ERROR: Unable to extract id field')
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
                self._downloader.trouble(u'ERROR: Unable to extract vid field')
            yahoo_vid = mobj.group(1)

            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video uploader')
        video_uploader = mobj.group(1).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video description')
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video height')
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video width')
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
        yv_bitrate = '700' # according to Wikipedia this is hard-coded
        request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
            self._downloader.trouble(u'ERROR: Unable to extract media URL')
        video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        video_url = unescapeHTML(video_url)

            'id': video_id.decode('utf-8'),
            'uploader': video_uploader,
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
            'thumbnail': video_thumbnail.decode('utf-8'),
            'description': video_description,
974 class VimeoIE(InfoExtractor):
975 """Information extractor for vimeo.com."""
977 # _VALID_URL matches Vimeo URLs
978 _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?:videos?/)?([0-9]+)'
981 def __init__(self, downloader=None):
982 InfoExtractor.__init__(self, downloader)
984 def report_download_webpage(self, video_id):
985 """Report webpage download."""
986 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
988 def report_extraction(self, video_id):
989 """Report information extraction."""
990 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
992 def _real_extract(self, url, new_video=True):
993 # Extract ID from URL
994 mobj = re.match(self._VALID_URL, url)
996 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
999 video_id = mobj.group(1)
1001 # Retrieve video webpage to extract further information
1002 request = compat_urllib_request.Request(url, None, std_headers)
1004 self.report_download_webpage(video_id)
1005 webpage_bytes = compat_urllib_request.urlopen(request).read()
1006 webpage = webpage_bytes.decode('utf-8')
1007 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1008 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1011 # Now we begin extracting as much information as we can from what we
1012 # retrieved. First we extract the information common to all extractors,
1013 # and latter we extract those that are Vimeo specific.
1014 self.report_extraction(video_id)
1016 # Extract the config JSON
1018 config = webpage.split(' = {config:')[1].split(',assets:')[0]
1019 config = json.loads(config)
1021 self._downloader.trouble(u'ERROR: unable to extract info section')
1025 video_title = config["video"]["title"]
1027 # Extract uploader and uploader_id
1028 video_uploader = config["video"]["owner"]["name"]
1029 video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]
1031 # Extract video thumbnail
1032 video_thumbnail = config["video"]["thumbnail"]
1034 # Extract video description
1035 video_description = get_element_by_attribute("itemprop", "description", webpage)
1036 if video_description: video_description = clean_html(video_description)
1037 else: video_description = ''
1039 # Extract upload date
1040 video_upload_date = None
1041 mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
1042 if mobj is not None:
1043 video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)
1045 # Vimeo specific: extract request signature and timestamp
1046 sig = config['request']['signature']
1047 timestamp = config['request']['timestamp']
1049 # Vimeo specific: extract video codec and quality information
1050 # First consider quality, then codecs, then take everything
1051 # TODO bind to format param
1052 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1053 files = { 'hd': [], 'sd': [], 'other': []}
1054 for codec_name, codec_extension in codecs:
1055 if codec_name in config["video"]["files"]:
1056 if 'hd' in config["video"]["files"][codec_name]:
1057 files['hd'].append((codec_name, codec_extension, 'hd'))
1058 elif 'sd' in config["video"]["files"][codec_name]:
1059 files['sd'].append((codec_name, codec_extension, 'sd'))
1061 files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))
1063 for quality in ('hd', 'sd', 'other'):
1064 if len(files[quality]) > 0:
1065 video_quality = files[quality][0][2]
1066 video_codec = files[quality][0][0]
1067 video_extension = files[quality][0][1]
1068 self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
1071 self._downloader.trouble(u'ERROR: no known codec found')
1074 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1075 %(video_id, sig, timestamp, video_quality, video_codec.upper())
1080 'uploader': video_uploader,
1081 'uploader_id': video_uploader_id,
1082 'upload_date': video_upload_date,
1083 'title': video_title,
1084 'ext': video_extension,
1085 'thumbnail': video_thumbnail,
1086 'description': video_description,
1090 class ArteTvIE(InfoExtractor):
1091 """arte.tv information extractor."""
1093 _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
# Live-stream pages are recognized by their "index-<n>.html" final path segment.
1094 _LIVE_URL = r'index-[0-9]+\.html$'
1096 IE_NAME = u'arte.tv'
1098 def __init__(self, downloader=None):
1099 InfoExtractor.__init__(self, downloader)
1101 def report_download_webpage(self, video_id):
1102 """Report webpage download."""
1103 self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)
1105 def report_extraction(self, video_id):
1106 """Report information extraction."""
1107 self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)
1109 def fetch_webpage(self, url):
# Download *url* and hand back the raw page bytes; network and URL errors are
# reported through the downloader's trouble() channel.
1110 request = compat_urllib_request.Request(url)
1112 self.report_download_webpage(url)
1113 webpage = compat_urllib_request.urlopen(request).read()
1114 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1115 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1117 except ValueError as err:
1118 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1122 def grep_webpage(self, url, regex, regexFlags, matchTuples):
# Fetch *url*, apply *regex* with *regexFlags*, and build a dict from
# *matchTuples*, each a (group_index, key, error_message) triple. A missing
# match or missing group is reported via trouble() with the given message.
1123 page = self.fetch_webpage(url)
1124 mobj = re.search(regex, page, regexFlags)
1128 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1131 for (i, key, err) in matchTuples:
1132 if mobj.group(i) is None:
1133 self._downloader.trouble(err)
1136 info[key] = mobj.group(i)
1140 def extractLiveStream(self, url):
# Two-step scrape for live streams: first locate the videothek JS file on the
# page, then pull path/player/url fields for the language-specific stream.
1141 video_lang = url.split('/')[-4]
1142 info = self.grep_webpage(
1144 r'src="(.*?/videothek_js.*?\.js)',
1147 (1, 'url', u'ERROR: Invalid URL: %s' % url)
1150 http_host = url.split('/')[2]
1151 next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
1152 info = self.grep_webpage(
1154 r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
1155 '(http://.*?\.swf).*?' +
1159 (1, 'path', u'ERROR: could not extract video path: %s' % url),
1160 (2, 'player', u'ERROR: could not extract video player: %s' % url),
1161 (3, 'url', u'ERROR: could not extract video url: %s' % url)
1164 video_url = u'%s/%s' % (info.get('url'), info.get('path'))
1166 def extractPlus7Stream(self, url):
# Three hops for "+7" catch-up videos: page -> videoref XML URL -> language-
# specific <video> ref -> final metadata (id, title, date, hd url).
1167 video_lang = url.split('/')[-3]
1168 info = self.grep_webpage(
1170 r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
1173 (1, 'url', u'ERROR: Invalid URL: %s' % url)
1176 next_url = compat_urllib_parse.unquote(info.get('url'))
1177 info = self.grep_webpage(
1179 r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
1182 (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
1185 next_url = compat_urllib_parse.unquote(info.get('url'))
1187 info = self.grep_webpage(
1189 r'<video id="(.*?)".*?>.*?' +
1190 '<name>(.*?)</name>.*?' +
1191 '<dateVideo>(.*?)</dateVideo>.*?' +
1192 '<url quality="hd">(.*?)</url>',
1195 (1, 'id', u'ERROR: could not extract video id: %s' % url),
1196 (2, 'title', u'ERROR: could not extract video title: %s' % url),
1197 (3, 'date', u'ERROR: could not extract video date: %s' % url),
1198 (4, 'url', u'ERROR: could not extract video url: %s' % url)
1203 'id': info.get('id'),
1204 'url': compat_urllib_parse.unquote(info.get('url')),
1205 'uploader': u'arte.tv',
1206 'upload_date': info.get('date'),
1207 'title': info.get('title').decode('utf-8'),
1213 def _real_extract(self, url):
# Dispatch: live-stream URLs go to extractLiveStream, everything else to the
# "+7" extractor.
1214 video_id = url.split('/')[-1]
1215 self.report_extraction(video_id)
1217 if re.search(self._LIVE_URL, video_id) is not None:
1218 self.extractLiveStream(url)
1221 info = self.extractPlus7Stream(url)
1226 class GenericIE(InfoExtractor):
1227 """Generic last-resort information extractor."""
1230 IE_NAME = u'generic'
1232 def __init__(self, downloader=None):
1233 InfoExtractor.__init__(self, downloader)
1235 def report_download_webpage(self, video_id):
1236 """Report webpage download."""
1237 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1238 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1240 def report_extraction(self, video_id):
1241 """Report information extraction."""
1242 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1244 def report_following_redirect(self, new_url):
1245 """Report information extraction."""
1246 self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1248 def _test_redirect(self, url):
1249 """Check if it is a redirect, like url shorteners, in case restart chain."""
# Issues a HEAD request so redirects can be resolved without downloading the
# body; if the final URL differs, the download is restarted on the new URL.
1250 class HeadRequest(compat_urllib_request.Request):
# Request subclass whose get_method() forces the HEAD verb.
1251 def get_method(self):
1254 class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
1256 Subclass the HTTPRedirectHandler to make it use our
1257 HeadRequest also on the redirected URL
1259 def redirect_request(self, req, fp, code, msg, headers, newurl):
1260 if code in (301, 302, 303, 307):
# Some servers emit unencoded spaces in Location headers.
1261 newurl = newurl.replace(' ', '%20')
# Drop body-related headers: a HEAD request carries no body.
1262 newheaders = dict((k,v) for k,v in req.headers.items()
1263 if k.lower() not in ("content-length", "content-type"))
1264 return HeadRequest(newurl,
1266 origin_req_host=req.get_origin_req_host(),
1269 raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)
1271 class HTTPMethodFallback(compat_urllib_request.BaseHandler):
1273 Fallback to GET if HEAD is not allowed (405 HTTP error)
1275 def http_error_405(self, req, fp, code, msg, headers):
1279 newheaders = dict((k,v) for k,v in req.headers.items()
1280 if k.lower() not in ("content-length", "content-type"))
# Re-open the same URL with a plain (GET) Request through the parent opener.
1281 return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
1283 origin_req_host=req.get_origin_req_host(),
# Build a bare opener with exactly the handlers needed for the HEAD probe;
# HTTPMethodFallback must come before HEADRedirectHandler in the chain.
1287 opener = compat_urllib_request.OpenerDirector()
1288 for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
1289 HTTPMethodFallback, HEADRedirectHandler,
1290 compat_urllib_error.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
1291 opener.add_handler(handler())
1293 response = opener.open(HeadRequest(url))
1294 new_url = response.geturl()
1299 self.report_following_redirect(new_url)
# Restart the whole extractor chain on the redirect target.
1300 self._downloader.download([new_url])
1303 def _real_extract(self, url):
# Last-resort scrape: find a direct media URL in the page (JW Player
# flashvars first, then a broader file=/source= pattern).
1304 if self._test_redirect(url): return
1306 video_id = url.split('/')[-1]
1307 request = compat_urllib_request.Request(url)
1309 self.report_download_webpage(video_id)
1310 webpage = compat_urllib_request.urlopen(request).read()
1311 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1312 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1314 except ValueError as err:
1315 # since this is the last-resort InfoExtractor, if
1316 # this error is thrown, it'll be thrown here
1317 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1320 self.report_extraction(video_id)
1321 # Start with something easy: JW Player in SWFObject
1322 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1324 # Broaden the search a little bit
1325 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1327 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1330 # It's possible that one of the regexes
1331 # matched, but returned an empty group:
1332 if mobj.group(1) is None:
1333 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1336 video_url = compat_urllib_parse.unquote(mobj.group(1))
1337 video_id = os.path.basename(video_url)
1339 # here's a fun little line of code for you:
# Derive extension and id from the media URL's basename.
1340 video_extension = os.path.splitext(video_id)[1][1:]
1341 video_id = os.path.splitext(video_id)[0]
1343 # it's tempting to parse this further, but you would
1344 # have to take into account all the variations like
1345 # Video Title - Site Name
1346 # Site Name | Video Title
1347 # Video Title - Tagline | Site Name
1348 # and so on and so forth; it's just not practical
1349 mobj = re.search(r'<title>(.*)</title>', webpage)
1351 self._downloader.trouble(u'ERROR: unable to extract title')
1353 video_title = mobj.group(1)
1355 # video uploader is domain name
1356 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1358 self._downloader.trouble(u'ERROR: unable to extract title')
1360 video_uploader = mobj.group(1)
1365 'uploader': video_uploader,
1366 'upload_date': None,
1367 'title': video_title,
1368 'ext': video_extension,
1372 class YoutubeSearchIE(InfoExtractor):
1373 """Information Extractor for YouTube search queries."""
# Handles "ytsearch<N>:query" / "ytsearchall:query" pseudo-URLs via the GData
# JSON-C API and queues each result as a normal watch-page download.
1374 _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
1375 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1376 _max_youtube_results = 1000
1377 IE_NAME = u'youtube:search'
1379 def __init__(self, downloader=None):
1380 InfoExtractor.__init__(self, downloader)
1382 def report_download_page(self, query, pagenum):
1383 """Report attempt to download search page with given number."""
1384 query = query.decode(preferredencoding())
1385 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1387 def _real_extract(self, query):
# Parse the "ytsearch<n|all>:terms" prefix and dispatch to
# _download_n_results with the requested result count (capped at the max).
1388 mobj = re.match(self._VALID_URL, query)
1390 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1393 prefix, query = query.split(':')
1395 query = query.encode('utf-8')
1397 self._download_n_results(query, 1)
1399 elif prefix == 'all':
1400 self._download_n_results(query, self._max_youtube_results)
1406 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1408 elif n > self._max_youtube_results:
1409 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1410 n = self._max_youtube_results
1411 self._download_n_results(query, n)
1413 except ValueError: # parsing prefix as integer fails
1414 self._download_n_results(query, 1)
1417 def _download_n_results(self, query, n):
1418 """Downloads a specified number of results for a query"""
# Page through the API 50 results at a time; the server-reported totalItems
# can shrink the effective limit below n.
1424 while (50 * pagenum) < limit:
1425 self.report_download_page(query, pagenum+1)
# GData start-index is 1-based.
1426 result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
1427 request = compat_urllib_request.Request(result_url)
1429 data = compat_urllib_request.urlopen(request).read()
1430 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1431 self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err))
1433 api_response = json.loads(data)['data']
1435 new_ids = list(video['id'] for video in api_response['items'])
1436 video_ids += new_ids
1438 limit = min(n, api_response['totalItems'])
1441 if len(video_ids) > n:
1442 video_ids = video_ids[:n]
1443 for id in video_ids:
1444 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1448 class GoogleSearchIE(InfoExtractor):
1449 """Information Extractor for Google Video search queries."""
# Handles "gvsearch<N>:query" / "gvsearchall:query" pseudo-URLs by scraping
# the Google Video search result pages (HTML, not an API).
1450 _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
1451 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1452 _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
# The "next page" link marker; its absence means the last page was reached.
1453 _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
1454 _max_google_results = 1000
1455 IE_NAME = u'video.google:search'
1457 def __init__(self, downloader=None):
1458 InfoExtractor.__init__(self, downloader)
1460 def report_download_page(self, query, pagenum):
1461 """Report attempt to download playlist page with given number."""
1462 query = query.decode(preferredencoding())
1463 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1465 def _real_extract(self, query):
# Parse the "gvsearch<n|all>:terms" prefix and dispatch to
# _download_n_results with the requested result count (capped at the max).
1466 mobj = re.match(self._VALID_URL, query)
1468 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1471 prefix, query = query.split(':')
1473 query = query.encode('utf-8')
1475 self._download_n_results(query, 1)
1477 elif prefix == 'all':
1478 self._download_n_results(query, self._max_google_results)
1484 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1486 elif n > self._max_google_results:
1487 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1488 n = self._max_google_results
1489 self._download_n_results(query, n)
1491 except ValueError: # parsing prefix as integer fails
1492 self._download_n_results(query, 1)
1495 def _download_n_results(self, query, n):
1496 """Downloads a specified number of results for a query"""
# Scrape result pages 10 at a time, deduplicating docids, until n videos are
# collected or the "next page" marker disappears; then queue the downloads.
1502 self.report_download_page(query, pagenum)
1503 result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
1504 request = compat_urllib_request.Request(result_url)
1506 page = compat_urllib_request.urlopen(request).read()
1507 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1508 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1511 # Extract video identifiers
1512 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1513 video_id = mobj.group(1)
1514 if video_id not in video_ids:
1515 video_ids.append(video_id)
1516 if len(video_ids) == n:
1517 # Specified n videos reached
1518 for id in video_ids:
1519 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1522 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1523 for id in video_ids:
1524 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1527 pagenum = pagenum + 1
1530 class YahooSearchIE(InfoExtractor):
1531 """Information Extractor for Yahoo! Video search queries."""
# Handles "yvsearch<N>:query" / "yvsearchall:query" pseudo-URLs by scraping
# the Yahoo! Video search result pages.
1534 _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
1535 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1536 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
# "Next" pagination link marker; its absence means the last page was reached.
1537 _MORE_PAGES_INDICATOR = r'\s*Next'
1538 _max_yahoo_results = 1000
1539 IE_NAME = u'video.yahoo:search'
1541 def __init__(self, downloader=None):
1542 InfoExtractor.__init__(self, downloader)
1544 def report_download_page(self, query, pagenum):
1545 """Report attempt to download playlist page with given number."""
1546 query = query.decode(preferredencoding())
1547 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1549 def _real_extract(self, query):
# Parse the "yvsearch<n|all>:terms" prefix and dispatch to
# _download_n_results with the requested result count (capped at the max).
1550 mobj = re.match(self._VALID_URL, query)
1552 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1555 prefix, query = query.split(':')
1557 query = query.encode('utf-8')
1559 self._download_n_results(query, 1)
1561 elif prefix == 'all':
1562 self._download_n_results(query, self._max_yahoo_results)
1568 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1570 elif n > self._max_yahoo_results:
1571 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1572 n = self._max_yahoo_results
1573 self._download_n_results(query, n)
1575 except ValueError: # parsing prefix as integer fails
1576 self._download_n_results(query, 1)
1579 def _download_n_results(self, query, n):
1580 """Downloads a specified number of results for a query"""
# Scrape result pages, deduplicating ids with a set, until n videos are
# collected or the "Next" marker disappears; then queue the downloads.
1583 already_seen = set()
1587 self.report_download_page(query, pagenum)
1588 result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
1589 request = compat_urllib_request.Request(result_url)
1591 page = compat_urllib_request.urlopen(request).read()
1592 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1593 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1596 # Extract video identifiers
1597 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1598 video_id = mobj.group(1)
1599 if video_id not in already_seen:
1600 video_ids.append(video_id)
1601 already_seen.add(video_id)
1602 if len(video_ids) == n:
1603 # Specified n videos reached
1604 for id in video_ids:
1605 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1608 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1609 for id in video_ids:
1610 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1613 pagenum = pagenum + 1
1616 class YoutubePlaylistIE(InfoExtractor):
1617 """Information Extractor for YouTube playlists."""
# group(1) of _VALID_URL is the playlist type marker (p/a/list), group(2) the
# playlist id, group(3) an optional single-video path component.
1619 _VALID_URL = r'(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL|EC)?|PL|EC)([0-9A-Za-z-_]{10,})(?:/.*?/([0-9A-Za-z_-]+))?.*'
1620 _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
1621 _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&([^&"]+&)*list=.*?%s'
# "Next »" pagination marker; its absence means the last page was reached.
1622 _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
1623 IE_NAME = u'youtube:playlist'
1625 def __init__(self, downloader=None):
1626 InfoExtractor.__init__(self, downloader)
1628 def report_download_page(self, playlist_id, pagenum):
1629 """Report attempt to download playlist page with given number."""
1630 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1632 def _real_extract(self, url):
# Collect every video id from the playlist's paginated HTML listing, apply
# the --playlist-start/--playlist-end window, and queue each watch URL.
1633 # Extract playlist id
1634 mobj = re.match(self._VALID_URL, url)
1636 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
# A single-video component in the URL takes precedence over the playlist.
1640 if mobj.group(3) is not None:
1641 self._downloader.download([mobj.group(3)])
1644 # Download playlist pages
1645 # prefix is 'p' as default for playlists but there are other types that need extra care
1646 playlist_prefix = mobj.group(1)
1647 if playlist_prefix == 'a':
1648 playlist_access = 'artist'
1650 playlist_prefix = 'p'
1651 playlist_access = 'view_play_list'
1652 playlist_id = mobj.group(2)
1657 self.report_download_page(playlist_id, pagenum)
1658 url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
1659 request = compat_urllib_request.Request(url)
1661 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1662 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1663 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1666 # Extract video identifiers
1668 for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
1669 if mobj.group(1) not in ids_in_page:
1670 ids_in_page.append(mobj.group(1))
1671 video_ids.extend(ids_in_page)
1673 if self._MORE_PAGES_INDICATOR not in page:
1675 pagenum = pagenum + 1
1677 total = len(video_ids)
# Apply the user-selected playlist window; playliststart is 1-based,
# playlistend == -1 means "to the end".
1679 playliststart = self._downloader.params.get('playliststart', 1) - 1
1680 playlistend = self._downloader.params.get('playlistend', -1)
1681 if playlistend == -1:
1682 video_ids = video_ids[playliststart:]
1684 video_ids = video_ids[playliststart:playlistend]
1686 if len(video_ids) == total:
1687 self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total))
1689 self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(video_ids)))
1691 for id in video_ids:
1692 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1696 class YoutubeChannelIE(InfoExtractor):
1697 """Information Extractor for YouTube channels."""
1699 _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
# Paginated list view of a channel's uploads, oldest first (sort=da).
1700 _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
# "Next »" pagination marker; its absence means the last page was reached.
1701 _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
1702 IE_NAME = u'youtube:channel'
1704 def report_download_page(self, channel_id, pagenum):
1705 """Report attempt to download channel page with given number."""
1706 self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))
1708 def _real_extract(self, url):
# Walk the channel's paginated video listing, collect unique video ids, and
# queue each one as a regular watch-page download.
1709 # Extract channel id
1710 mobj = re.match(self._VALID_URL, url)
1712 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1715 # Download channel pages
1716 channel_id = mobj.group(1)
1721 self.report_download_page(channel_id, pagenum)
1722 url = self._TEMPLATE_URL % (channel_id, pagenum)
1723 request = compat_urllib_request.Request(url)
1725 page = compat_urllib_request.urlopen(request).read().decode('utf8')
1726 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1727 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1730 # Extract video identifiers
1732 for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
1733 if mobj.group(1) not in ids_in_page:
1734 ids_in_page.append(mobj.group(1))
1735 video_ids.extend(ids_in_page)
1737 if self._MORE_PAGES_INDICATOR not in page:
1739 pagenum = pagenum + 1
1741 self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))
1743 for id in video_ids:
1744 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1748 class YoutubeUserIE(InfoExtractor):
1749 """Information Extractor for YouTube users."""
# Accepts user-page URLs or "ytuser:<name>" pseudo-URLs and lists the user's
# uploads via the GData feed, _GDATA_PAGE_SIZE entries per request.
1751 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1752 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
1753 _GDATA_PAGE_SIZE = 50
1754 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
1755 _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
1756 IE_NAME = u'youtube:user'
1758 def __init__(self, downloader=None):
1759 InfoExtractor.__init__(self, downloader)
1761 def report_download_page(self, username, start_index):
1762 """Report attempt to download user page."""
1763 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
1764 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
1766 def _real_extract(self, url):
# Page through the user's GData uploads feed, collect unique video ids,
# apply the --playlist-start/--playlist-end window, and queue downloads.
1768 mobj = re.match(self._VALID_URL, url)
1770 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1773 username = mobj.group(1)
1775 # Download video ids using YouTube Data API. Result size per
1776 # query is limited (currently to 50 videos) so we need to query
1777 # page by page until there are no video ids - it means we got
# GData start-index is 1-based.
1784 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1785 self.report_download_page(username, start_index)
1787 request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
1790 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1791 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1792 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1795 # Extract video identifiers
1798 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1799 if mobj.group(1) not in ids_in_page:
1800 ids_in_page.append(mobj.group(1))
1802 video_ids.extend(ids_in_page)
1804 # A little optimization - if current page is not
1805 # "full", ie. does not contain PAGE_SIZE video ids then
1806 # we can assume that this page is the last one - there
1807 # are no more ids on further pages - no need to query
1810 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
1815 all_ids_count = len(video_ids)
# Apply the user-selected playlist window; playliststart is 1-based,
# playlistend == -1 means "to the end".
1816 playliststart = self._downloader.params.get('playliststart', 1) - 1
1817 playlistend = self._downloader.params.get('playlistend', -1)
1819 if playlistend == -1:
1820 video_ids = video_ids[playliststart:]
1822 video_ids = video_ids[playliststart:playlistend]
1824 self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
1825 (username, all_ids_count, len(video_ids)))
1827 for video_id in video_ids:
1828 self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
1831 class BlipTVUserIE(InfoExtractor):
1832 """Information Extractor for blip.tv users."""
# Accepts blip.tv user-page URLs or "bliptvuser:<name>" pseudo-URLs and lists
# the user's episodes via the mobile Ajax endpoint, one page at a time.
1834 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
1836 IE_NAME = u'blip.tv:user'
1838 def __init__(self, downloader=None):
1839 InfoExtractor.__init__(self, downloader)
1841 def report_download_page(self, username, pagenum):
1842 """Report attempt to download user page."""
1843 self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
1844 (self.IE_NAME, username, pagenum))
1846 def _real_extract(self, url):
# Resolve the username to a numeric users_id scraped from the profile page,
# then page through the Ajax episode list collecting unique video paths.
1848 mobj = re.match(self._VALID_URL, url)
1850 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1853 username = mobj.group(1)
1855 page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
1857 request = compat_urllib_request.Request(url)
1860 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
# The profile page embeds the numeric account id needed by the Ajax endpoint.
1861 mobj = re.search(r'data-users-id="([^"]+)"', page)
1862 page_base = page_base % mobj.group(1)
1863 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1864 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1868 # Download video ids using BlipTV Ajax calls. Result size per
1869 # query is limited (currently to 12 videos) so we need to query
1870 # page by page until there are no video ids - it means we got
1877 self.report_download_page(username, pagenum)
1879 request = compat_urllib_request.Request( page_base + "&page=" + str(pagenum) )
1882 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1883 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1884 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1887 # Extract video identifiers
1890 for mobj in re.finditer(r'href="/([^"]+)"', page):
1891 if mobj.group(1) not in ids_in_page:
1892 ids_in_page.append(unescapeHTML(mobj.group(1)))
1894 video_ids.extend(ids_in_page)
1896 # A little optimization - if current page is not
1897 # "full", ie. does not contain PAGE_SIZE video ids then
1898 # we can assume that this page is the last one - there
1899 # are no more ids on further pages - no need to query
1902 if len(ids_in_page) < self._PAGE_SIZE:
1907 all_ids_count = len(video_ids)
# Apply the user-selected playlist window; playliststart is 1-based,
# playlistend == -1 means "to the end".
1908 playliststart = self._downloader.params.get('playliststart', 1) - 1
1909 playlistend = self._downloader.params.get('playlistend', -1)
1911 if playlistend == -1:
1912 video_ids = video_ids[playliststart:]
1914 video_ids = video_ids[playliststart:playlistend]
1916 self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
1917 (self.IE_NAME, username, all_ids_count, len(video_ids)))
1919 for video_id in video_ids:
1920 self._downloader.download([u'http://blip.tv/'+video_id])
1923 class DepositFilesIE(InfoExtractor):
1924 """Information extractor for depositfiles.com"""
# The (?#locale) inline comment documents that the optional two-dot path
# segment is a locale prefix (e.g. /de/, /ru/).
1926 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
1928 def report_download_webpage(self, file_id):
1929 """Report webpage download."""
1930 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
1932 def report_extraction(self, file_id):
1933 """Report information extraction."""
1934 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
1936 def _real_extract(self, url):
# POST the "Free download" form to the English-locale page and scrape the
# real fileshare URL and title out of the response.
1937 file_id = url.split('/')[-1]
1938 # Rebuild url in english locale
1939 url = 'http://depositfiles.com/en/files/' + file_id
1941 # Retrieve file webpage with 'Free download' button pressed
# Sending a data payload makes this a POST request.
1942 free_download_indication = { 'gateway_result' : '1' }
1943 request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
1945 self.report_download_webpage(file_id)
1946 webpage = compat_urllib_request.urlopen(request).read()
1947 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1948 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % compat_str(err))
1951 # Search for the real file URL
1952 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
1953 if (mobj is None) or (mobj.group(1) is None):
1954 # Try to figure out reason of the error.
# The site explains refusals (e.g. download limits) in an <strong> blurb;
# surface that message instead of a generic failure.
1955 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
1956 if (mobj is not None) and (mobj.group(1) is not None):
1957 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
1958 self._downloader.trouble(u'ERROR: %s' % restriction_message)
1960 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
1963 file_url = mobj.group(1)
1964 file_extension = os.path.splitext(file_url)[1][1:]
1966 # Search for file title
1967 mobj = re.search(r'<b title="(.*?)">', webpage)
1969 self._downloader.trouble(u'ERROR: unable to extract title')
1971 file_title = mobj.group(1).decode('utf-8')
1974 'id': file_id.decode('utf-8'),
1975 'url': file_url.decode('utf-8'),
1977 'upload_date': None,
1978 'title': file_title,
1979 'ext': file_extension.decode('utf-8'),
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    # Numeric video id is captured as named group 'ID'.
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    # Mobile login endpoint used by _real_initialize.
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    # Format identifiers probed in the page source, ordered best-first.
    _available_formats = ['video', 'highqual', 'lowqual']
    _video_extensions = {
    IE_NAME = u'facebook'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def _reporter(self, message):
        """Add header and report message."""
        self._downloader.to_screen(u'[facebook] %s' % message)

    def report_login(self):
        """Report attempt to log in."""
        self._reporter(u'Logging in')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._reporter(u'%s: Downloading video webpage' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._reporter(u'%s: Extracting video information' % video_id)

    def _parse_page(self, video_webpage):
        """Extract video information from page"""
        # Regexes that pull metadata out of inline JavaScript on the page.
        data = {'title': r'\("video_title", "(.*?)"\)',
            'description': r'<div class="datawrap">(.*?)</div>',
            'owner': r'\("video_owner_name", "(.*?)"\)',
            'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
        for piece in data.keys():
            mobj = re.search(data[piece], video_webpage)
            if mobj is not None:
                # Values come escaped-Unicode inside the (generally utf-8) page.
                video_info[piece] = compat_urllib_parse.unquote_plus(mobj.group(1).decode("unicode_escape"))

        # Probe each known format id for its source URL.
        for fmt in self._available_formats:
            mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
            if mobj is not None:
                # URL is in a Javascript segment inside an escaped Unicode format within
                # the generally utf-8 page
                video_urls[fmt] = compat_urllib_parse.unquote_plus(mobj.group(1).decode("unicode_escape"))
        video_info['video_urls'] = video_urls

    def _real_initialize(self):
        """Log in to Facebook using --username/--password or .netrc credentials."""
        if self._downloader is None:

        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            info = netrc.netrc().authenticators(self._NETRC_MACHINE)
            if info is not None:
                raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                # .netrc problems are non-fatal: warn and continue anonymously.
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))

        if useremail is None:

        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
            login_results = compat_urllib_request.urlopen(request).read()
            # A login <form> in the response means authentication did not succeed.
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))

    def _real_extract(self, url):
        """Download the video page and return the selected format(s)."""
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group('ID')

        self.report_video_webpage_download(video_id)
        request = compat_urllib_request.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
            page = compat_urllib_request.urlopen(request)
            video_webpage = page.read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))

        # Start extracting information
        self.report_information_extraction(video_id)

        # Extract information
        video_info = self._parse_page(video_webpage)

        if 'owner' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = video_info['owner']

        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = video_info['title']
        video_title = video_title.decode('utf-8')

        # Missing thumbnail is only a warning, not fatal.
        if 'thumbnail' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
            video_thumbnail = video_info['thumbnail']

        if 'upload_date' in video_info:
            upload_time = video_info['upload_date']
            # RFC 2822 date from the page; normalized to YYYYMMDD.
            timetuple = email.utils.parsedate_tz(upload_time)
            if timetuple is not None:
                upload_date = time.strftime('%Y%m%d', timetuple[0:9])

        video_description = video_info.get('description', 'No description available.')

        url_map = video_info['video_urls']

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)
        format_limit = self._downloader.params.get('format_limit', None)

        if format_limit is not None and format_limit in self._available_formats:
            # Cap quality at format_limit (list is ordered best-first).
            format_list = self._available_formats[self._available_formats.index(format_limit):]
            format_list = self._available_formats
        existing_formats = [x for x in format_list if x in url_map]
        if len(existing_formats) == 0:
            self._downloader.trouble(u'ERROR: no known formats available for video')
        if req_format is None:
            video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
        elif req_format == 'worst':
            video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
        elif req_format == '-1':
            video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            if req_format not in url_map:
                self._downloader.trouble(u'ERROR: requested format not available')
            video_url_list = [(req_format, url_map[req_format])] # Specific format

        for format_param, video_real_url in video_url_list:
            # Extension per format id; 'mp4' when the format is unknown.
            video_extension = self._video_extensions.get(format_param, 'mp4')

            # NOTE(review): the .decode('utf-8') calls below assume py2-style
            # byte strings — TODO confirm against the compat layer in use.
            'id': video_id.decode('utf-8'),
            'url': video_real_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'upload_date': upload_date,
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
            'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
            'thumbnail': video_thumbnail.decode('utf-8'),
            'description': video_description.decode('utf-8'),
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    # Pulls the filename extension off a media URL.
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        """Fetch blip.tv JSON metadata for the URL, or detect a direct media link."""
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # Ask the site for JSON metadata instead of the HTML page.
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        self.report_extraction(mobj.group(1))
            urlh = compat_urllib_request.urlopen(request)
            # A video/* Content-Type means the URL already points at the media file.
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                'upload_date': None,
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))

        if info is None: # Regular URL
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % compat_str(err))

            json_data = json.loads(json_code)
            # Metadata may be nested under a 'Post' key or be at the top level.
            if 'Post' in json_data:
                data = json_data['Post']

            # Dates look like '10-23-11 10:30AM'; normalize to YYYYMMDD.
            upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
            video_url = data['media']['url']
            umobj = re.match(self._URL_EXT, video_url)
                raise ValueError('Can not determine filename extension')
            ext = umobj.group(1)

                'id': data['item_id'],
                'uploader': data['display_name'],
                'upload_date': upload_date,
                'title': data['title'],
                'format': data['media']['mimeType'],
                'thumbnail': data['thumbnailUrl'],
                'description': data['description'],
                'player_url': data['embedUrl']
            except (ValueError,KeyError) as err:
                self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))

        # blip.tv's media servers serve the file only to an iTunes User-Agent.
        std_headers['User-Agent'] = 'iTunes/10.6.1'
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the FLV URL and title from a myvideo.de watch page.

        Returns a one-element list of info dicts, or None after reporting
        an error via self._downloader.trouble().
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # FIX: was 'self._download.trouble' — a typo that raised
            # AttributeError on invalid URLs instead of reporting the error.
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)
        # The thumbnail link encodes the media server path; the FLV lives
        # alongside it as <path>/<video_id>.flv.
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
                webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': None,
            'upload_date':  None,
            'title':    video_title,
            'ext':      u'flv',
        }]
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    # Verbose regex (matched with re.VERBOSE): whitespace below is ignored.
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                     |(https?://)?(www\.)?
                         (?P<showname>thedailyshow|colbertnation)\.com/
                        (full-episodes/(?P<episode>.*)|
                          (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                          |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
    IE_NAME = u'comedycentral'

    # Known bitrates, worst-first; turls is kept in this order too.
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    _video_extensions = {
    _video_dimensions = {

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL needs the re.VERBOSE flag.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def report_extraction(self, episode_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

    def report_config_download(self, episode_id):
        """Report per-media configuration download."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)

    def report_index_download(self, episode_id):
        """Report show index (MRSS feed) download."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

    def report_player_url(self, episode_id):
        """Report player URL resolution."""
        self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)

    def _print_formats(self, formats):
        """Print the available format ids with extension and dimensions."""
        print('Available formats:')
        print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))

    def _real_extract(self, url):
        """Resolve shortname/episode/clip URLs, then download via the MRSS index."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # Shortnames like :tds redirect to the newest full episode page.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
                epTitle = mobj.group('cntitle')
            dlNewest = not mobj.group('episode')
                epTitle = mobj.group('showname')
                epTitle = mobj.group('episode')

        req = compat_urllib_request.Request(url)
        self.report_extraction(epTitle)
            htmlHandle = compat_urllib_request.urlopen(req)
            html = htmlHandle.read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            # Follow the redirect to the concrete episode page.
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
                self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
            if mobj.group('episode') == '':
                self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
            epTitle = mobj.group('episode')

        # Flash player references carry the mtvnservices media id.
        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', html)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.
            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', html)
            if len(altMovieParams) == 0:
                self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        playerUrl_raw = mMovieParams[0][0]
        self.report_player_url(epTitle)
            # Resolve redirects so the final player URL can be handed to rtmpdump.
            urlHandle = compat_urllib_request.urlopen(playerUrl_raw)
            playerUrl = urlHandle.geturl()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to find out player URL: ' + compat_str(err))

        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        self.report_index_download(epTitle)
            indexXml = compat_urllib_request.urlopen(indexUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download episode index: ' + compat_str(err))

        # One <item> per act/segment of the episode.
        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for itemEl in itemEls:
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configReq = compat_urllib_request.Request(configUrl)
            self.report_config_download(epTitle)
                configXml = compat_urllib_request.urlopen(configReq).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))

            # Each <rendition> is one (bitrate, rtmp url) choice.
            cdoc = xml.etree.ElementTree.fromstring(configXml)
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])

            # For now, just pick the highest bitrate
            format,video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
                    format, video_url = f, v

            # Patch to download from alternative CDN, which does not
            # break on current RTMPDump builds
            broken_cdn = "rtmpe://viacomccstrmfs.fplive.net/viacomccstrm/gsp.comedystor/"
            better_cdn = "rtmpe://cp10740.edgefcs.net/ondemand/mtvnorigin/gsp.comedystor/"

            if video_url.startswith(broken_cdn):
                video_url = video_url.replace(broken_cdn, better_cdn)

            effTitle = showId + u'-' + epTitle
                'upload_date': officialDate,
                'description': officialTitle,
                'player_url': None #playerUrl

            results.append(info)
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_extraction(self, showName):
        """Report information extraction."""
        self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)

    def report_config_download(self, showName):
        """Report player configuration download."""
        self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)

    def _real_extract(self, url):
        """Extract the media URL from the page's og: metadata and player config."""
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
            webPage = compat_urllib_request.urlopen(url)
            webPageBytes = webPage.read()
            # Decode using the charset advertised in the Content-Type header.
            m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
            webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: ' + compat_str(err))

        # NOTE(review): the .group(1) calls below assume every <meta> tag is
        # present; a missing tag would raise AttributeError — TODO confirm.
        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
        description = unescapeHTML(descMatch.group(1))
        imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
        imgUrl = unescapeHTML(imgMatch.group(1))
        playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
        playerUrl = unescapeHTML(playerUrlMatch.group(1))
        # The player URL carries the config file location in its query string.
        configUrlMatch = re.search('config=(.*)$', playerUrl)
        configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))

        self.report_config_download(showName)
            configJSON = compat_urllib_request.urlopen(configUrl)
            m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
            configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download configuration: ' + compat_str(err))

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

            config = json.loads(configJSON)
        except (ValueError,) as err:
            self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + compat_str(err))

        playlist = config['playlist']
        # Index 1 holds the actual video entry — presumably index 0 is an
        # intro/preroll; TODO confirm against a live config file.
        videoUrl = playlist[1]['url']

            'uploader': showName,
            'upload_date': None,
            'thumbnail': imgUrl,
            'description': description,
            'player_url': playerUrl,
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Resolve the video via the moogaloop metadata XML and its f4m manifest."""
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group('videoid')

            'upload_date': None,

        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
            self._downloader.trouble(u'\nERROR: Invalid metadata XML file')

        # hdcore parameter is required by Adobe HTTP Dynamic Streaming servers.
        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))

        # f4m manifest uses the Adobe f4m XML namespace.
        adoc = xml.etree.ElementTree.fromstring(manifestXml)
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError as err:
            self._downloader.trouble(u'\nERROR: Invalid manifest file')

        # Compose the fragment URL from the manifest location and media node.
        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract the FLV URL, title and thumbnail from an xvideos page."""
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group(1)

        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        # Extract video URL (percent-encoded in a flashvars-style parameter).
        mobj = re.search(r'flv_url=(.+?)&', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video url')
        video_url = compat_urllib_parse.unquote(mobj.group(1))

        # Extract title from the <title> tag, dropping the site-name suffix.
        mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = mobj.group(1)

        # Extract video thumbnail
        mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
        # group(0): the whole matched URL is the thumbnail address.
        video_thumbnail = mobj.group(0)

            'upload_date': None,
            'title': video_title,
            'thumbnail': video_thumbnail,
            'description': None,
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid

    # Captures uploader slug (group 1) and track slug (group 2).
    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Resolve the track via the Soundcloud API and fetch its MP3 stream URL."""
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract simple title (uploader + slug of song title)
        slug_title = mobj.group(2)
        simple_title = uploader + u'-' + slug_title

        self.report_resolve('%s/%s' % (uploader, slug_title))

        # resolve.json maps a public page URL to the API track resource.
        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(resolv_url)
            info_json_bytes = compat_urllib_request.urlopen(request).read()
            info_json = info_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction('%s/%s' % (uploader, slug_title))

        # The streams endpoint returns the concrete CDN URLs for the track.
        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(streams_url)
            stream_json_bytes = compat_urllib_request.urlopen(request).read()
            stream_json = stream_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download stream definitions: %s' % compat_str(err))

        streams = json.loads(stream_json)
        mediaURL = streams['http_mp3_128_url']

            'uploader': info['user']['username'],
            'upload_date': info['created_at'],
            'title': info['title'],
            'description': info['description'],
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def report_webpage(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract the RTMPE media URL (base64-encoded in the page) and title."""
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        self.report_webpage(url)

        request = compat_urllib_request.Request(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))

        self.report_extraction(url)

        # The media path is base64-encoded in the jsclassref attribute.
        # NOTE(review): .decode('base64') is a Python-2-only codec — confirm
        # against the compat layer this file targets.
        mobj = re.search(r"jsclassref='([^']*)'", webpage)
            self._downloader.trouble(u'ERROR: unable to extract video url')
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + compat_urllib_parse.unquote(mobj.group(1).decode('base64'))

        mobj = re.search(r'contentTitle = "(.*?)";', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = mobj.group(1).decode('utf-8')

        # Extract description
        video_description = u'No description available.'
        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if mobj is not None:
            video_description = mobj.group(1).decode('utf-8')

        # Derive id and extension from the media filename.
        video_filename = video_url.split('/')[-1]
        video_id, extension = video_filename.split('.')

            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'description': video_description,
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_json(self, file_id):
        """Report JSON download."""
        self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json"""
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        # Probes each candidate with a request; unreachable URLs are skipped.
        for url in url_list:
                compat_urllib_request.urlopen(url)
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:

    def _print_formats(self, formats):
        """Print every (format, bitrate, extension) combination available."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
2944 def _real_extract(self, url):
2945 mobj = re.match(self._VALID_URL, url)
2947 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2949 # extract uploader & filename from url
2950 uploader = mobj.group(1).decode('utf-8')
2951 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2953 # construct API request
2954 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2955 # retrieve .json file with links to files
2956 request = compat_urllib_request.Request(file_url)
2958 self.report_download_json(file_url)
2959 jsonData = compat_urllib_request.urlopen(request).read()
2960 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2961 self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err))
2965 json_data = json.loads(jsonData)
2966 player_url = json_data['player_swf_url']
2967 formats = dict(json_data['audio_formats'])
2969 req_format = self._downloader.params.get('format', None)
2972 if self._downloader.params.get('listformats', None):
2973 self._print_formats(formats)
2976 if req_format is None or req_format == 'best':
2977 for format_param in formats.keys():
2978 url_list = self.get_urls(formats, format_param)
2980 file_url = self.check_urls(url_list)
2981 if file_url is not None:
2984 if req_format not in formats:
2985 self._downloader.trouble(u'ERROR: format is not available')
2988 url_list = self.get_urls(formats, req_format)
2989 file_url = self.check_urls(url_list)
2990 format_param = req_format
2993 'id': file_id.decode('utf-8'),
2994 'url': file_url.decode('utf-8'),
2995 'uploader': uploader.decode('utf-8'),
2996 'upload_date': None,
2997 'title': json_data['name'],
2998 'ext': file_url.split('.')[-1].decode('utf-8'),
2999 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
3000 'thumbnail': json_data['thumbnail_url'],
3001 'description': json_data['description'],
3002 'player_url': player_url.decode('utf-8'),
3005 class StanfordOpenClassroomIE(InfoExtractor):
3006 """Information extractor for Stanford's Open ClassRoom"""
# (gaps in the leading original-line numbers mark lines elided from this listing)
3008 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
3009 IE_NAME = u'stanfordoc'
3011 def report_download_webpage(self, objid):
3012 """Report information extraction."""
3013 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
3015 def report_extraction(self, video_id):
3016 """Report information extraction."""
3017 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3019 def _real_extract(self, url):
# Three URL shapes: a specific video (course+video groups), a course page
# (course group only), or the site root. The page cases collect 'reference'
# entries and recurse through self.extract() on each.
3020 mobj = re.match(self._VALID_URL, url)
3022 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3025 if mobj.group('course') and mobj.group('video'): # A specific video
3026 course = mobj.group('course')
3027 video = mobj.group('video')
3029 'id': course + '_' + video,
3031 'upload_date': None,
3034 self.report_extraction(info['id'])
3035 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
3036 xmlUrl = baseUrl + video + '.xml'
3038 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
3039 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3040 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
# Title and media filename come from the per-video XML manifest.
3042 mdoc = xml.etree.ElementTree.fromstring(metaXml)
3044 info['title'] = mdoc.findall('./title')[0].text
3045 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
3047 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
3049 info['ext'] = info['url'].rpartition('.')[2]
3051 elif mobj.group('course'): # A course page
3052 course = mobj.group('course')
3057 'upload_date': None,
3060 self.report_download_webpage(info['id'])
3062 coursepage = compat_urllib_request.urlopen(url).read()
3063 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3064 self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
3067 m = re.search('<h1>([^<]+)</h1>', coursepage)
3069 info['title'] = unescapeHTML(m.group(1))
3071 info['title'] = info['id']
3073 m = re.search('<description>([^<]+)</description>', coursepage)
3075 info['description'] = unescapeHTML(m.group(1))
# orderedSet preserves first-seen order while deduplicating video links.
3077 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
3080 'type': 'reference',
3081 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
3085 for entry in info['list']:
3086 assert entry['type'] == 'reference'
3087 results += self.extract(entry['url'])
3092 'id': 'Stanford OpenClassroom',
3095 'upload_date': None,
3098 self.report_download_webpage(info['id'])
3099 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
3101 rootpage = compat_urllib_request.urlopen(rootURL).read()
3102 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3103 self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
3106 info['title'] = info['id']
3108 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
3111 'type': 'reference',
3112 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
3117 for entry in info['list']:
3118 assert entry['type'] == 'reference'
3119 results += self.extract(entry['url'])
3122 class MTVIE(InfoExtractor):
3123 """Information extractor for MTV.com"""
# (gaps in the leading original-line numbers mark lines elided from this listing)
3125 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
3128 def report_extraction(self, video_id):
3129 """Report information extraction."""
3130 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3132 def _real_extract(self, url):
# Scrapes mtv_vt/mtv_an/mtvn_uri <meta> tags plus the playlist id from the
# page, then fetches the mediaGen XML to pick a rendition.
3133 mobj = re.match(self._VALID_URL, url)
3135 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Scheme is optional in _VALID_URL; default to http before downloading.
3137 if not mobj.group('proto'):
3138 url = 'http://' + url
3139 video_id = mobj.group('videoid')
3141 webpage = self._download_webpage(url, video_id)
3143 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
3145 self._downloader.trouble(u'ERROR: unable to extract song name')
# NOTE(review): .decode('iso-8859-1') assumes a Python 2 byte string; under
# Python 3 str this raises AttributeError — confirm the target interpreter.
3147 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3148 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
3150 self._downloader.trouble(u'ERROR: unable to extract performer')
3152 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3153 video_title = performer + ' - ' + song_name
3155 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
# NOTE(review): message reads 'unable to mtvn_uri' — likely missing the word
# 'extract'; left unchanged here since runtime strings are code.
3157 self._downloader.trouble(u'ERROR: unable to mtvn_uri')
3159 mtvn_uri = mobj.group(1)
3161 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
3163 self._downloader.trouble(u'ERROR: unable to extract content id')
3165 content_id = mobj.group(1)
3167 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
3168 self.report_extraction(video_id)
3169 request = compat_urllib_request.Request(videogen_url)
3171 metadataXml = compat_urllib_request.urlopen(request).read()
3172 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3173 self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err))
3176 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
3177 renditions = mdoc.findall('.//rendition')
3179 # For now, always pick the highest quality.
3180 rendition = renditions[-1]
# Format label is built as '<ext>-<width>x<height>_<bitrate>' from the
# rendition attributes; src child holds the actual media URL.
3183 _,_,ext = rendition.attrib['type'].partition('/')
3184 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
3185 video_url = rendition.find('./src').text
3187 self._downloader.trouble('Invalid rendition field.')
3193 'uploader': performer,
3194 'upload_date': None,
3195 'title': video_title,
3203 class YoukuIE(InfoExtractor):
# Information extractor for v.youku.com; downloads per-segment FLV parts.
# (gaps in the leading original-line numbers mark lines elided from this listing)
3204 _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
3206 def report_download_webpage(self, file_id):
3207 """Report webpage download."""
3208 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, file_id))
3210 def report_extraction(self, file_id):
3211 """Report information extraction."""
3212 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
# Build a pseudo-unique session id from the current time plus two random ints.
3215 nowTime = int(time.time() * 1000)
3216 random1 = random.randint(1000,1998)
3217 random2 = random.randint(1000,9999)
3219 return "%d%d%d" %(nowTime,random1,random2)
3221 def _get_file_ID_mix_string(self, seed):
# Deterministically permute the character source using the site's seeded
# linear-congruential shuffle, so file ids can be decoded client-side.
3223 source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
3225 for i in range(len(source)):
3226 seed = (seed * 211 + 30031 ) % 65536
3227 index = math.floor(seed / 65536 * len(source) )
3228 mixed.append(source[int(index)])
3229 source.remove(source[int(index)])
3230 #return ''.join(mixed)
3233 def _get_file_id(self, fileId, seed):
# Map each '*'-separated index through the seed-shuffled alphabet.
3234 mixed = self._get_file_ID_mix_string(seed)
3235 ids = fileId.split('*')
3239 realId.append(mixed[int(ch)])
3240 return ''.join(realId)
3242 def _real_extract(self, url):
3243 mobj = re.match(self._VALID_URL, url)
3245 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3247 video_id = mobj.group('ID')
3249 info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
3251 request = compat_urllib_request.Request(info_url, None, std_headers)
3253 self.report_download_webpage(video_id)
3254 jsondata = compat_urllib_request.urlopen(request).read()
3255 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3256 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
3259 self.report_extraction(video_id)
3261 jsonstr = jsondata.decode('utf-8')
3262 config = json.loads(jsonstr)
3264 video_title = config['data'][0]['title']
3265 seed = config['data'][0]['seed']
# Pick the format: prefer hd2 for 'best', lowest for 'worst' (the selection
# branches are partly among the elided lines).
3267 format = self._downloader.params.get('format', None)
3268 supported_format = list(config['data'][0]['streamfileids'].keys())
3270 if format is None or format == 'best':
3271 if 'hd2' in supported_format:
3276 elif format == 'worst':
3284 fileid = config['data'][0]['streamfileids'][format]
3285 keys = [s['k'] for s in config['data'][0]['segs'][format]]
3286 except (UnicodeDecodeError, ValueError, KeyError):
3287 self._downloader.trouble(u'ERROR: unable to extract info section')
3291 sid = self._gen_sid()
3292 fileid = self._get_file_id(fileid, seed)
3294 #column 8,9 of fileid represent the segment number
3295 #fileid[7:9] should be changed
3296 for index, key in enumerate(keys):
# Splice the hex segment number into the file id, then build the per-segment
# download URL with the session id and segment key.
3298 temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
3299 download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
3302 'id': '%s_part%02d' % (video_id, index),
3303 'url': download_url,
3305 'upload_date': None,
3306 'title': video_title,
3309 files_info.append(info)
3314 class XNXXIE(InfoExtractor):
3315 """Information extractor for xnxx.com"""
# (gaps in the leading original-line numbers mark lines elided from this listing)
3317 _VALID_URL = r'^http://video\.xnxx\.com/video([0-9]+)/(.*)'
# Page-scraping patterns: flash video URL, page title, and thumbnail.
3319 VIDEO_URL_RE = r'flv_url=(.*?)&'
3320 VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
3321 VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'
3323 def report_webpage(self, video_id):
3324 """Report information extraction"""
3325 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3327 def report_extraction(self, video_id):
3328 """Report information extraction"""
3329 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3331 def _real_extract(self, url):
3332 mobj = re.match(self._VALID_URL, url)
3334 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3336 video_id = mobj.group(1)
3338 self.report_webpage(video_id)
3340 # Get webpage content
3342 webpage_bytes = compat_urllib_request.urlopen(url).read()
3343 webpage = webpage_bytes.decode('utf-8')
3344 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3345 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
3348 result = re.search(self.VIDEO_URL_RE, webpage)
3350 self._downloader.trouble(u'ERROR: unable to extract video url')
# The flv_url query value is percent-encoded in the page source.
3352 video_url = compat_urllib_parse.unquote(result.group(1))
3354 result = re.search(self.VIDEO_TITLE_RE, webpage)
3356 self._downloader.trouble(u'ERROR: unable to extract video title')
3358 video_title = result.group(1)
3360 result = re.search(self.VIDEO_THUMB_RE, webpage)
3362 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3364 video_thumbnail = result.group(1)
3370 'upload_date': None,
3371 'title': video_title,
3373 'thumbnail': video_thumbnail,
3374 'description': None,
3378 class GooglePlusIE(InfoExtractor):
3379 """Information extractor for plus.google.com."""
# (gaps in the leading original-line numbers mark lines elided from this listing)
3381 _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
3382 IE_NAME = u'plus.google'
3384 def __init__(self, downloader=None):
3385 InfoExtractor.__init__(self, downloader)
3387 def report_extract_entry(self, url):
3388 """Report downloading extry"""
3389 self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)
3391 def report_date(self, upload_date):
3392 """Report downloading extry"""
3393 self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)
3395 def report_uploader(self, uploader):
3396 """Report downloading extry"""
3397 self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)
3399 def report_title(self, video_title):
3400 """Report downloading extry"""
3401 self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)
3403 def report_extract_vid_page(self, video_page):
3404 """Report information extraction."""
3405 self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)
3407 def _real_extract(self, url):
3408 # Extract id from URL
3409 mobj = re.match(self._VALID_URL, url)
3411 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
3414 post_url = mobj.group(0)
3415 video_id = mobj.group(1)
3417 video_extension = 'flv'
3419 # Step 1, Retrieve post webpage to extract further information
3420 self.report_extract_entry(post_url)
3421 request = compat_urllib_request.Request(post_url)
3423 webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3424 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3425 self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))
3428 # Extract update date
3430 pattern = 'title="Timestamp">(.*?)</a>'
3431 mobj = re.search(pattern, webpage)
3433 upload_date = mobj.group(1)
3434 # Convert timestring to a format suitable for filename
3435 upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3436 upload_date = upload_date.strftime('%Y%m%d')
3437 self.report_date(upload_date)
# Uploader name comes from the rel="author" anchor on the post page.
3441 pattern = r'rel\="author".*?>(.*?)</a>'
3442 mobj = re.search(pattern, webpage)
3444 uploader = mobj.group(1)
3445 self.report_uploader(uploader)
3448 # Get the first line for title
3450 pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3451 mobj = re.search(pattern, webpage)
3453 video_title = mobj.group(1)
3454 self.report_title(video_title)
3456 # Step 2, Stimulate clicking the image box to launch video
3457 pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3458 mobj = re.search(pattern, webpage)
3460 self._downloader.trouble(u'ERROR: unable to extract video page URL')
3462 video_page = mobj.group(1)
3463 request = compat_urllib_request.Request(video_page)
3465 webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3466 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3467 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
3469 self.report_extract_vid_page(video_page)
3472 # Extract video links on video page
3473 """Extract video links of all sizes"""
3474 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3475 mobj = re.findall(pattern, webpage)
3477 self._downloader.trouble(u'ERROR: unable to extract video links')
3479 # Sort in resolution
3480 links = sorted(mobj)
3482 # Choose the lowest of the sort, i.e. highest resolution
3483 video_url = links[-1]
3484 # Only get the url. The resolution part in the tuple has no use anymore
3485 video_url = video_url[-1]
3486 # Treat escaped \u0026 style hex
# Python 2 str has .decode; Python 3 str does not, hence the AttributeError
# fallback through bytes(...).decode below.
3488 video_url = video_url.decode("unicode_escape")
3489 except AttributeError: # Python 3
3490 video_url = bytes(video_url, 'ascii').decode('unicode-escape')
3496 'uploader': uploader,
3497 'upload_date': upload_date,
3498 'title': video_title,
3499 'ext': video_extension,
3502 class NBAIE(InfoExtractor):
# Information extractor for nba.com video pages; the media URL is built
# directly from the page path against the Turner CDN.
# (gaps in the leading original-line numbers mark lines elided from this listing)
3503 _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
3506 def _real_extract(self, url):
3507 mobj = re.match(self._VALID_URL, url)
3509 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3512 video_id = mobj.group(1)
3513 if video_id.endswith('/index.html'):
3514 video_id = video_id[:-len('/index.html')]
3516 webpage = self._download_webpage(url, video_id)
3518 video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
# Helper: first regex group from the page, unescaped, or `default`.
3519 def _findProp(rexp, default=None):
3520 m = re.search(rexp, webpage)
3522 return unescapeHTML(m.group(1))
3526 shortened_video_id = video_id.rpartition('/')[2]
3527 title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
3529 'id': shortened_video_id,
# NOTE(review): key 'uploader_date' looks like a typo for 'upload_date'
# (the field name every other extractor here uses) — confirm before fixing.
3533 'uploader_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
3534 'description': _findProp(r'<div class="description">(.*?)</h1>'),
3538 class JustinTVIE(InfoExtractor):
3539 """Information extractor for justin.tv and twitch.tv"""
3540 # TODO: One broadcast may be split into multiple videos. The key
3541 # 'broadcast_id' is the same for all parts, and 'broadcast_part'
3542 # starts at 1 and increases. Can we treat all parts as one video?
# (gaps in the leading original-line numbers mark lines elided from this listing)
3544 _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
3545 ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
3546 _JUSTIN_PAGE_LIMIT = 100
3547 IE_NAME = u'justin.tv'
3549 def report_extraction(self, file_id):
3550 """Report information extraction."""
3551 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3553 def report_download_page(self, channel, offset):
3554 """Report attempt to download a single page of videos."""
3555 self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
3556 (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
3558 # Return count of items, list of *valid* items
3559 def _parse_page(self, url):
3561 urlh = compat_urllib_request.urlopen(url)
3562 webpage_bytes = urlh.read()
3563 webpage = webpage_bytes.decode('utf-8', 'ignore')
3564 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3565 self._downloader.trouble(u'ERROR: unable to download video info JSON: %s' % compat_str(err))
3568 response = json.loads(webpage)
3570 for clip in response:
3571 video_url = clip['video_file_url']
3573 video_extension = os.path.splitext(video_url)[1][1:]
# created_on is 'YYYY-MM-DD...'; strip dashes to get the YYYYMMDD format.
3574 video_date = re.sub('-', '', clip['created_on'][:10])
3578 'title': clip['title'],
3579 'uploader': clip.get('user_id', clip.get('channel_id')),
3580 'upload_date': video_date,
3581 'ext': video_extension,
3583 return (len(response), info)
3585 def _real_extract(self, url):
3586 mobj = re.match(self._VALID_URL, url)
3588 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# lastindex == 1 means a bare channel URL (archives); otherwise a /b/ clip.
3591 api = 'http://api.justin.tv'
3592 video_id = mobj.group(mobj.lastindex)
3594 if mobj.lastindex == 1:
3596 api += '/channel/archives/%s.json'
3598 api += '/clip/show/%s.json'
3599 api = api % (video_id,)
3601 self.report_extraction(video_id)
# Page through the API; a short page (count != limit) ends the loop.
3605 limit = self._JUSTIN_PAGE_LIMIT
3608 self.report_download_page(video_id, offset)
3609 page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
3610 page_count, page_info = self._parse_page(page_url)
3611 info.extend(page_info)
3612 if not paged or page_count != limit:
3617 class FunnyOrDieIE(InfoExtractor):
# Information extractor for funnyordie.com video pages.
# (gaps in the leading original-line numbers mark lines elided from this listing)
3618 _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
3620 def _real_extract(self, url):
3621 mobj = re.match(self._VALID_URL, url)
3623 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3626 video_id = mobj.group('id')
3627 webpage = self._download_webpage(url, video_id)
# Media URL is the second <source> inside the page's <video> element.
3629 m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
3631 self._downloader.trouble(u'ERROR: unable to find video information')
3632 video_url = unescapeHTML(m.group('url'))
3634 m = re.search(r"class='player_page_h1'>\s+<a.*?>(?P<title>.*?)</a>", webpage)
3636 self._downloader.trouble(u'Cannot find video title')
3637 title = unescapeHTML(m.group('title'))
3639 m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
3641 desc = unescapeHTML(m.group('desc'))
3650 'description': desc,
3654 class TweetReelIE(InfoExtractor):
# Information extractor for tweetreel.com; the media URL is derived from the
# embedded Twitter status id.
# (gaps in the leading original-line numbers mark lines elided from this listing)
3655 _VALID_URL = r'^(?:https?://)?(?:www\.)?tweetreel\.com/[?](?P<id>[0-9a-z]+)$'
3657 def _real_extract(self, url):
3658 mobj = re.match(self._VALID_URL, url)
3660 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3663 video_id = mobj.group('id')
3664 webpage = self._download_webpage(url, video_id)
3666 m = re.search(r'<div id="left" status_id="([0-9]+)">', webpage)
3668 self._downloader.trouble(u'ERROR: Cannot find status ID')
3669 status_id = m.group(1)
3671 m = re.search(r'<div class="tweet_text">(.*?)</div>', webpage, flags=re.DOTALL)
3673 self._downloader.trouble(u'WARNING: Cannot find description')
# Description is the tweet text with anchor tags stripped out.
3674 desc = unescapeHTML(re.sub('<a.*?</a>', '', m.group(1))).strip()
3676 m = re.search(r'<div class="tweet_info">.*?from <a target="_blank" href="https?://twitter.com/(?P<uploader_id>.+?)">(?P<uploader>.+?)</a>', webpage, flags=re.DOTALL)
3678 self._downloader.trouble(u'ERROR: Cannot find uploader')
3679 uploader = unescapeHTML(m.group('uploader'))
3680 uploader_id = unescapeHTML(m.group('uploader_id'))
3682 m = re.search(r'<span unixtime="([0-9]+)"', webpage)
3684 self._downloader.trouble(u'ERROR: Cannot find upload date')
# NOTE(review): fromtimestamp uses the local timezone, so the YYYYMMDD date
# can differ by machine — confirm whether UTC was intended.
3685 upload_date = datetime.datetime.fromtimestamp(int(m.group(1))).strftime('%Y%m%d')
3688 video_url = 'http://files.tweetreel.com/video/' + status_id + '.mov'
3695 'description': desc,
3696 'uploader': uploader,
3697 'uploader_id': uploader_id,
3698 'internal_id': status_id,
3699 'upload_date': upload_date
3703 class SteamIE(InfoExtractor):
# Information extractor for store.steampowered.com game trailer pages.
# (gaps in the leading original-line numbers mark lines elided from this listing)
3704 _VALID_URL = r"""http://store.steampowered.com/
3705 (?P<urltype>video|app)/ #If the page is only for videos or for a game
3707 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
3710 def suitable(self, url):
3711 """Receives a URL and returns True if suitable for this IE."""
# Overridden because _VALID_URL is written with (?x)-style whitespace and
# comments, so it must be matched with re.VERBOSE.
3712 return re.match(self._VALID_URL, url, re.VERBOSE) is not None
3714 def _real_extract(self, url):
3715 m = re.match(self._VALID_URL, url, re.VERBOSE)
# movie_<id> entries in the page's JS hold the media file URLs; titles come
# from separate <span class="title"> elements, zipped together below.
3716 urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
3717 gameID = m.group('gameID')
3718 videourl = 'http://store.steampowered.com/video/%s/' % gameID
3719 webpage = self._download_webpage(videourl, gameID)
3720 mweb = re.finditer(urlRE, webpage)
3721 namesRE = r'<span class=\"title\">(?P<videoName>[\w:/\.\?=\+\s-]+)</span>'
3722 titles = list(re.finditer(namesRE, webpage))
3724 for vid,vtitle in zip(mweb,titles):
3725 video_id = vid.group('videoID')
3726 title = vtitle.group('videoName')
3727 video_url = vid.group('videoURL')
3729 self._downloader.trouble(u'ERROR: Cannot find video url for %s' % video_id)
3739 class UstreamIE(InfoExtractor):
# Information extractor for ustream.tv recorded videos; the media URL is
# derived directly from the numeric video id.
# (gaps in the leading original-line numbers mark lines elided from this listing)
3740 _VALID_URL = r'http://www.ustream.tv/recorded/(?P<videoID>\d+)'
3741 IE_NAME = u'ustream'
3743 def _real_extract(self, url):
3744 m = re.match(self._VALID_URL, url)
3745 video_id = m.group('videoID')
3746 video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
3747 webpage = self._download_webpage(url, video_id)
# NOTE(review): both re.search calls below assume a match (no None check);
# a page without these attributes would raise AttributeError.
3748 m = re.search(r'data-title="(?P<title>.+)"',webpage)
3749 title = m.group('title')
3750 m = re.search(r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',webpage)
3751 uploader = m.group('uploader')
3757 'uploader': uploader
3762 def gen_extractors():
3763 """ Return a list of an instance of every supported extractor.
3764 The order does matter; the first extractor matched is the one handling the URL.
3767 YoutubePlaylistIE(),
3791 StanfordOpenClassroomIE(),