2 # -*- coding: utf-8 -*-
4 from __future__ import absolute_import
13 import xml.etree.ElementTree
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    title:          Video title, unescaped.
    ext:            Video filename extension.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader_id:    Nickname or id of the video uploader.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The .srt file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    The fields should all be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    # Lazily initialized by initialize(); _WORKING marks broken IEs.
    _ready = False
    _downloader = None
    _WORKING = True

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(self._VALID_URL, url) is not None

    def working(self):
        """Getter method for _WORKING."""
        return self._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        # Run the (possibly expensive) real initialization only once.
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @property
    def IE_NAME(self):
        # Derive the IE name from the class name, e.g. YoutubeIE -> Youtube.
        # Subclasses may shadow this with a plain IE_NAME class attribute.
        return type(self).__name__[:-2]

    def _download_webpage(self, url, video_id, note=None, errnote=None):
        """Download *url* and return its contents decoded as UTF-8.

        Reports progress via the downloader; raises ExtractorError on any
        network-level failure.
        """
        if note is None:
            note = u'Downloading video webpage'
        self._downloader.to_screen(u'[%s] %s: %s' % (self.IE_NAME, video_id, note))
        try:
            urlh = compat_urllib_request.urlopen(url)
            webpage_bytes = urlh.read()
            # 'replace' keeps going on malformed UTF-8 instead of raising.
            return webpage_bytes.decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is None:
                errnote = u'Unable to download webpage'
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)))
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # NOTE(review): this copy of the file appears truncated -- the raw-string
    # assignment opening/closing this verbose regex (e.g. _VALID_URL = r'''...''')
    # is missing, as are several try:/if/return/else lines below. Restore from
    # upstream before running.
    (?:https?://)? # http(s):// (optional)
    (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
    tube\.majestyc\.net/) # the various hostnames, with wildcard subdomains
    (?:.*?\#/)? # handle anchor (#/) redirect urls
    (?!view_play_list|my_playlists|artist|playlist) # ignore playlist URLs
    (?: # the various things that can precede the ID:
    (?:(?:v|embed|e)/) # v/ or embed/ or e/
    |(?: # or the v= param in all its forms
    (?:watch(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
    (?:\?|\#!?) # the params delimiter ? or # or #!
    (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
    )? # optional -> youtube.com/xxxx is OK
    )? # all until now is optional -> you can pass the naked ID
    ([0-9A-Za-z_-]+) # here is it! the YouTube video ID
    (?(1).+)? # if we found the ID, everything can follow
    # Endpoints used during initialization (language, login, age gate).
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # NOTE(review): both dict literals below are missing entries and their
    # closing braces in this copy.
    _video_extensions = {
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
    _video_dimensions = {

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overrides the base class to compile _VALID_URL in VERBOSE mode.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report extracted video URL."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _closed_captions_xml_to_srt(self, xml_string):
        # Converts YouTube's timedtext XML into SubRip (.srt) text.
        # NOTE(review): an `srt = ''` initialization, a `start = float(start)`
        # conversion (start is still a str at the addition below) and the final
        # `return srt` appear to be missing in this copy -- confirm upstream.
        texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
        # TODO parse xml instead of regex
        for n, (start, dur_tag, dur, caption) in enumerate(texts):
            if not dur: dur = '4'
            end = start + float(dur)
            # Format seconds as SRT timestamps HH:MM:SS,mmm.
            start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
            end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
            caption = unescapeHTML(caption)
            caption = unescapeHTML(caption) # double cycle, intentional
            srt += str(n+1) + '\n'
            srt += start + ' --> ' + end + '\n'
            srt += caption + '\n\n'

    def _extract_subtitles(self, video_id):
        # Returns a (warning_message, srt_contents) pair; exactly one is None.
        self.report_video_subtitles_download(video_id)
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
        # NOTE(review): the `try:` matching the `except` below is missing here.
            srt_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
        # Map language code -> track name from the caption track list.
        srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
        srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
        if not srt_lang_list:
            return (u'WARNING: video has no closed captions', None)
        # Preference order: user-requested language, then English, then any.
        if self._downloader.params.get('subtitleslang', False):
            srt_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in srt_lang_list:
            srt_lang = list(srt_lang_list.keys())[0]
        if not srt_lang in srt_lang_list:
            return (u'WARNING: no closed captions found in the specified language', None)
        request = compat_urllib_request.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id))
        # NOTE(review): the `try:` matching the `except` below is missing here.
            srt_xml = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
            return (u'WARNING: unable to download video subtitles', None)
        return (None, self._closed_captions_xml_to_srt(srt_xml))

    def _print_formats(self, formats):
        # Print one line per itag: "itag : extension [dimensions]".
        print('Available formats:')
        # NOTE(review): the `for x in formats:` loop header is missing here.
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        # Sets language, optionally logs in (params or .netrc), confirms age.
        if self._downloader is None:

        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))

        # Set language (best effort -- failures only warn).
        request = compat_urllib_request.Request(self._LANG_URL)
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % compat_str(err))

        # No authentication to be performed
        # NOTE(review): the `login_form = {` opening is missing in this copy.
            'current_form': 'loginForm',
            'action_login': 'Log In',
            'username': username,
            'password': password,
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # A login form in the response means the credentials were rejected.
            if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))

        # Confirm age -- failure here is fatal (trouble with ERROR).
            'action_confirm': 'Confirm',
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
            self.report_age_confirmation()
            age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))

    def _extract_id(self, url):
        # Pull the 11-char video id out of any supported YouTube URL form.
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group(2)

    def _real_extract(self, url):
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
            url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
            # Un-escape the backslash-escaped URL from the JS config.
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))

        # Get video info: try several &el= variants until one yields a token.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = compat_urllib_request.Request(video_info_url)
                video_info_webpage_bytes = compat_urllib_request.urlopen(request).read()
                video_info_webpage = video_info_webpage_bytes.decode('utf-8', 'ignore')
                video_info = compat_parse_qs(video_info_webpage)
                if 'token' in video_info:
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0])
                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.trouble(u'ERROR: "rental" videos not supported')

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader name')
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id (nickname) -- best effort, warns on failure.
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/user/([^"]+)">', video_webpage)
            video_uploader_id = mobj.group(1)
            self._downloader.trouble(u'WARNING: unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
        else: # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date: scraped from the page, normalized to YYYYMMDD.
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
            video_description = ''

        # closed captions (only when --write-srt was requested)
        video_subtitles = None
        if self._downloader.params.get('writesubtitles', False):
            (srt_error, video_subtitles) = self._extract_subtitles(video_id)
                self._downloader.trouble(srt_error)

        if 'length_seconds' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video duration')
        video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
            # itag -> signed download URL
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                        video_url_list = [(rf, url_map[rf])]
                if video_url_list is None:
                    self._downloader.trouble(u'ERROR: requested format not available')
            self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')

        # Build one info dict per chosen format.
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                self._video_dimensions.get(format_param, '???'))

            # NOTE(review): the `results.append({` / `'id': ...` lines opening
            # this dict literal are missing in this copy.
                'url': video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date': upload_date,
                'title': video_title,
                'ext': video_extension,
                'format': video_format,
                'thumbnail': video_thumbnail,
                'description': video_description,
                'player_url': player_url,
                'subtitles': video_subtitles,
                'duration': video_duration
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    # NOTE(review): this copy appears truncated -- several try:/if/return
    # lines are missing below (orphan `except` clauses remain).
    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        # Fetch the disclaimer page, then POST the family-filter form so
        # age-restricted videos are reachable.
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        # NOTE(review): the `try:` matching the `except` below is missing here.
            self.report_disclaimer()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err))

        # Confirm age -- the `disclaimer_form = {` opening is missing here.
            'submit': "Continue - I'm over 18",
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
            self.report_age_confirmation()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))

    def _real_extract(self, url):
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            # Delegate yt-prefixed ids to the YouTube extractor.
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
            # Fallback: parse the flashvars blob for mediaData.
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
                self._downloader.trouble(u'ERROR: unable to extract media URL')

            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
            mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
                self._downloader.trouble(u'ERROR: unable to extract media URL')
            mediaURL = mobj.group(1).replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'submitter=(.*?);', webpage)
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = mobj.group(1)

        # NOTE(review): the `return [{` opening this result dict is missing.
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    # NOTE(review): this copy appears truncated -- several try:/if/return
    # lines are missing below (orphan `except` clauses remain).
    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # Strip slug/query suffixes from the captured id segment.
        video_id = mobj.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        # Disable the family filter so restricted videos are visible.
        request.add_header('Cookie', 'family_filter=off')
            self.report_download_webpage(video_id)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'\s*var flashvars = (.*)', webpage)
            self._downloader.trouble(u'ERROR: unable to extract media URL')
        flashvars = compat_urllib_parse.unquote(mobj.group(1))

        # Probe quality keys from best to worst; first hit wins.
        for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
                self._downloader.to_screen(u'[dailymotion] Using %s' % key)
            self._downloader.trouble(u'ERROR: unable to extract video URL')

        mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
            self._downloader.trouble(u'ERROR: unable to extract video URL')

        video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = unescapeHTML(mobj.group('title'))

        video_uploader = None
        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
            # lookin for official user
            mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if mobj_official is None:
                self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
                video_uploader = mobj_official.group(1)
            video_uploader = mobj.group(1)

        video_upload_date = None
        mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
            # Page shows DD-MM-YYYY; reorder to YYYYMMDD.
            video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)

        # NOTE(review): the `return [{` opening this result dict is missing.
            'uploader': video_uploader,
            'upload_date': video_upload_date,
            'title': video_title,
            'ext': video_extension,
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    # NOTE(review): this copy appears truncated -- several try:/if/return
    # lines are missing below (orphan `except` clauses remain).
    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract media URL')
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))

        # Title and uploader come from one <title> match (groups 1 and 2).
        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        video_uploader = mobj.group(2).decode('utf-8')

        # NOTE(review): the `return [{` opening this result dict is missing.
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader,
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    # NOTE(review): this copy appears truncated -- several try:/if/return
    # lines are missing below (orphan `except` clauses remain).
    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = compat_urllib_request.Request(url)
                webpage = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
                self._downloader.trouble(u'ERROR: Unable to extract id field')
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
                self._downloader.trouble(u'ERROR: Unable to extract vid field')
            yahoo_vid = mobj.group(1)

            # Recurse once with the canonical /watch/ URL.
            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video uploader')
        video_uploader = mobj.group(1).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video description')
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video height')
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video width')
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
        yv_bitrate = '700' # according to Wikipedia this is hard-coded
        request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
            self._downloader.trouble(u'ERROR: Unable to extract media URL')
        video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        video_url = unescapeHTML(video_url)

        # NOTE(review): the `return [{` opening this result dict is missing.
            'id': video_id.decode('utf-8'),
            'uploader': video_uploader,
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
            'thumbnail': video_thumbnail.decode('utf-8'),
            'description': video_description,
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""
    # NOTE(review): several control-flow lines (if-None guards, try:, return)
    # appear to be missing from this copy of the file; the trouble() calls
    # below normally run only on failure paths — verify against upstream.

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?:https?://)?(?:(?:www|player).)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?:videos?/)?([0-9]+)'

    def __init__(self, downloader=None):
        # Delegate to the base class; receives an optional FileDownloader.
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract video metadata and the media URL from a Vimeo page."""
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
        video_id = mobj.group(1)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        self.report_download_webpage(video_id)
        webpage_bytes = compat_urllib_request.urlopen(request).read()
        webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON embedded in the page's player bootstrap.
        config = webpage.split(' = {config:')[1].split(',assets:')[0]
        config = json.loads(config)
        self._downloader.trouble(u'ERROR: unable to extract info section')

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description: video_description = clean_html(video_description)
        else: video_description = ''

        # Extract upload date
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            # Collapse YYYY-MM-DD into the YYYYMMDD form expected upstream.
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        # Pick the first file from the best available quality bucket.
        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
        self._downloader.trouble(u'ERROR: no known codec found')

        # Build the final media URL from Vimeo's signed redirect endpoint.
        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
            %(video_id, sig, timestamp, video_quality, video_codec.upper())

        'uploader': video_uploader,
        'uploader_id': video_uploader_id,
        'upload_date': video_upload_date,
        'title': video_title,
        'ext': video_extension,
        'thumbnail': video_thumbnail,
        'description': video_description,
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""
    # NOTE(review): several control-flow lines (if-None guards, try:, return,
    # closing parentheses) appear to be missing from this copy of the file;
    # the trouble() calls below normally run only on failure paths.

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    # Live pages end in index-<n>.html; used to dispatch in _real_extract.
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def __init__(self, downloader=None):
        # Delegate to the base class; receives an optional FileDownloader.
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)

    def fetch_webpage(self, url):
        """Download *url* and return its raw content."""
        request = compat_urllib_request.Request(url)
        self.report_download_webpage(url)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
        except ValueError as err:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch *url*, apply *regex*, and collect groups into an info dict.

        matchTuples is a sequence of (group_index, key, error_message)
        tuples; each matched group is stored under *key*.
        """
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                self._downloader.trouble(err)
            info[key] = mobj.group(i)

    def extractLiveStream(self, url):
        """Resolve the stream for a live arte.tv page."""
        # Language code sits four path segments from the end of a live URL.
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            r'src="(.*?/videothek_js.*?\.js)',
            (1, 'url', u'ERROR: Invalid URL: %s' % url)
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
            '(http://.*?\.swf).*?' +
            (1, 'path', u'ERROR: could not extract video path: %s' % url),
            (2, 'player', u'ERROR: could not extract video player: %s' % url),
            (3, 'url', u'ERROR: could not extract video url: %s' % url)
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Resolve video info for an arte.tv "+7" (catch-up) page."""
        # Language code sits three path segments from the end of a +7 URL.
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            (1, 'url', u'ERROR: Invalid URL: %s' % url)
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
        next_url = compat_urllib_parse.unquote(info.get('url'))

        info = self.grep_webpage(
            r'<video id="(.*?)".*?>.*?' +
            '<name>(.*?)</name>.*?' +
            '<dateVideo>(.*?)</dateVideo>.*?' +
            '<url quality="hd">(.*?)</url>',
            (1, 'id', u'ERROR: could not extract video id: %s' % url),
            (2, 'title', u'ERROR: could not extract video title: %s' % url),
            (3, 'date', u'ERROR: could not extract video date: %s' % url),
            (4, 'url', u'ERROR: could not extract video url: %s' % url)

        'id': info.get('id'),
        'url': compat_urllib_parse.unquote(info.get('url')),
        'uploader': u'arte.tv',
        'upload_date': info.get('date'),
        'title': info.get('title').decode('utf-8'),

    def _real_extract(self, url):
        """Dispatch to the live-stream or +7 extractor based on the URL."""
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
        info = self.extractPlus7Stream(url)
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor."""
    # NOTE(review): several control-flow lines (if-None guards, try:, return)
    # appear to be missing from this copy of the file; error calls below
    # normally run only when the preceding match/download failed.

    IE_NAME = u'generic'

    def __init__(self, downloader=None):
        # Delegate to the base class; receives an optional FileDownloader.
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
        self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)

    def report_following_redirect(self, new_url):
        """Report that a redirect (e.g. URL shortener) is being followed."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case restart chain."""
        class HeadRequest(compat_urllib_request.Request):
            # Issue HEAD instead of GET so only headers are fetched.
            def get_method(self):

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    # Drop body-related headers: a HEAD request has no body.
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       origin_req_host=req.get_origin_req_host(),
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                        origin_req_host=req.get_origin_req_host(),

        # Build a private opener chain so the HEAD/redirect handlers above
        # apply without disturbing the global urllib opener.
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_error.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()
        self.report_following_redirect(new_url)
        # Restart the extraction chain with the resolved URL.
        self._downloader.download([new_url])

    def _real_extract(self, url):
        """Scrape an arbitrary page for an embedded media URL."""
        if self._test_redirect(url): return

        video_id = url.split('/')[-1]
        request = compat_urllib_request.Request(url)
        self.report_download_webpage(video_id)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
        except ValueError as err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        # Broaden the search a little bit
        mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        self._downloader.trouble(u'ERROR: unable to extract title')
        video_uploader = mobj.group(1)

        'uploader': video_uploader,
        'upload_date': None,
        'title': video_title,
        'ext': video_extension,
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries."""
    # NOTE(review): several control-flow lines (if-None guards, try:, return)
    # appear to be missing from this copy of the file; the dangling elif/except
    # branches below belong to elided if/try statements.

    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    # Hard cap on how many results a single ytsearchall query may fetch.
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def __init__(self, downloader=None):
        # Delegate to the base class; receives an optional FileDownloader.
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the ytsearch<N>/ytsearchall prefix and fetch that many results."""
        mobj = re.match(self._VALID_URL, query)
        self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)

        prefix, query = query.split(':')
        query = query.encode('utf-8')
        # Empty prefix: download only the single best match.
        self._download_n_results(query, 1)
        elif prefix == 'all':
            self._download_n_results(query, self._max_youtube_results)
        self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
        elif n > self._max_youtube_results:
            self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
            n = self._max_youtube_results
        self._download_n_results(query, n)
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""
        # Page through the GData API, 50 results per request.
        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            data = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err))
            api_response = json.loads(data)['data']

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            # Never request more than the API reports as available.
            limit = min(n, api_response['totalItems'])

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries."""
    # NOTE(review): several control-flow lines (if-None guards, try:, return)
    # appear to be missing from this copy of the file; the dangling elif/except
    # branches below belong to elided if/try statements.

    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    # Presence of the "next page" button in the result HTML.
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def __init__(self, downloader=None):
        # Delegate to the base class; receives an optional FileDownloader.
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the gvsearch<N>/gvsearchall prefix and fetch that many results."""
        mobj = re.match(self._VALID_URL, query)
        self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)

        prefix, query = query.split(':')
        query = query.encode('utf-8')
        # Empty prefix: download only the single best match.
        self._download_n_results(query, 1)
        elif prefix == 'all':
            self._download_n_results(query, self._max_google_results)
        self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
        elif n > self._max_google_results:
            self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
            n = self._max_google_results
        self._download_n_results(query, n)
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""
        self.report_download_page(query, pagenum)
        # Google paginates by result offset, 10 per page.
        result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
        request = compat_urllib_request.Request(result_url)
        page = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))

        # Extract video identifiers
        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
            video_id = mobj.group(1)
            if video_id not in video_ids:
                video_ids.append(video_id)
                if len(video_ids) == n:
                    # Specified n videos reached
                    for id in video_ids:
                        self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])

        # No next-page link: flush everything collected so far.
        if re.search(self._MORE_PAGES_INDICATOR, page) is None:
            for id in video_ids:
                self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])

        pagenum = pagenum + 1
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""
    # NOTE(review): several control-flow lines (if-None guards, try:, return)
    # appear to be missing from this copy of the file; the dangling elif/except
    # branches below belong to elided if/try statements.

    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    # Presence of a "Next" pagination link in the result HTML.
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def __init__(self, downloader=None):
        # Delegate to the base class; receives an optional FileDownloader.
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the yvsearch<N>/yvsearchall prefix and fetch that many results."""
        mobj = re.match(self._VALID_URL, query)
        self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)

        prefix, query = query.split(':')
        query = query.encode('utf-8')
        # Empty prefix: download only the single best match.
        self._download_n_results(query, 1)
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
        self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
        elif n > self._max_yahoo_results:
            self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
            n = self._max_yahoo_results
        self._download_n_results(query, n)
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""
        # Yahoo may repeat ids across pages; dedupe with a set.
        already_seen = set()

        self.report_download_page(query, pagenum)
        result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
        request = compat_urllib_request.Request(result_url)
        page = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))

        # Extract video identifiers
        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
            video_id = mobj.group(1)
            if video_id not in already_seen:
                video_ids.append(video_id)
                already_seen.add(video_id)
                if len(video_ids) == n:
                    # Specified n videos reached
                    for id in video_ids:
                        self._downloader.download(['http://video.yahoo.com/watch/%s' % id])

        # No next-page link: flush everything collected so far.
        if re.search(self._MORE_PAGES_INDICATOR, page) is None:
            for id in video_ids:
                self._downloader.download(['http://video.yahoo.com/watch/%s' % id])

        pagenum = pagenum + 1
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""
    # NOTE(review): several control-flow lines (if-None guards, try:, else:,
    # return/break) appear to be missing from this copy of the file; the
    # dangling except/if lines below belong to elided statements.

    _VALID_URL = r'(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL|EC)?|PL|EC)([0-9A-Za-z-_]{10,})(?:/.*?/([0-9A-Za-z_-]+))?.*'
    _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
    _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&([^&"]+&)*list=.*?%s'
    # "Next »" pagination marker in the playlist HTML.
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:playlist'

    def __init__(self, downloader=None):
        # Delegate to the base class; receives an optional FileDownloader.
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        """Collect all video ids in the playlist and queue them for download."""
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid url: %s' % url)

        # Single video case: group 3 captured an individual video id.
        if mobj.group(3) is not None:
            self._downloader.download([mobj.group(3)])

        # Download playlist pages
        # prefix is 'p' as default for playlists but there are other types that need extra care
        playlist_prefix = mobj.group(1)
        if playlist_prefix == 'a':
            playlist_access = 'artist'
        playlist_prefix = 'p'
        playlist_access = 'view_play_list'
        playlist_id = mobj.group(2)

        self.report_download_page(playlist_id, pagenum)
        url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
        request = compat_urllib_request.Request(url)
        page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))

        # Extract video identifiers
        for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))
        video_ids.extend(ids_in_page)

        # Stop paging once the "Next" marker disappears.
        if self._MORE_PAGES_INDICATOR not in page:
        pagenum = pagenum + 1

        total = len(video_ids)

        # Apply --playlist-start / --playlist-end slicing (1-based options).
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)
        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        video_ids = video_ids[playliststart:playlistend]

        if len(video_ids) == total:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total))
        self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(video_ids)))

        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""
    # NOTE(review): several control-flow lines (if-None guards, try:,
    # break/return) appear to be missing from this copy of the file; the
    # dangling except/if lines below belong to elided statements.

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    # "Next »" pagination marker in the channel HTML.
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:channel'

    def report_download_page(self, channel_id, pagenum):
        """Report attempt to download channel page with given number."""
        self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))

    def _real_extract(self, url):
        """Collect all video ids on the channel and queue them for download."""
        # Extract channel id
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid url: %s' % url)

        # Download channel pages
        channel_id = mobj.group(1)

        self.report_download_page(channel_id, pagenum)
        url = self._TEMPLATE_URL % (channel_id, pagenum)
        request = compat_urllib_request.Request(url)
        page = compat_urllib_request.urlopen(request).read().decode('utf8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))

        # Extract video identifiers
        for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))
        video_ids.extend(ids_in_page)

        # Stop paging once the "Next" marker disappears.
        if self._MORE_PAGES_INDICATOR not in page:
        pagenum = pagenum + 1

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""
    # NOTE(review): several control-flow lines (if-None guards, try:, while,
    # break/return) appear to be missing from this copy of the file; the
    # dangling except/if lines below belong to elided statements.

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    # GData caps each uploads query at 50 entries per request.
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def __init__(self, downloader=None):
        # Delegate to the base class; receives an optional FileDownloader.
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_extract(self, url):
        """Collect all upload ids for the user and queue them for download."""
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid url: %s' % url)

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        start_index = pagenum * self._GDATA_PAGE_SIZE + 1
        self.report_download_page(username, start_index)

        request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

        page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))

        # Extract video identifiers
        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))

        video_ids.extend(ids_in_page)

        # A little optimization - if current page is not
        # "full", ie. does not contain PAGE_SIZE video ids then
        # we can assume that this page is the last one - there
        # are no more ids on further pages - no need to query
        if len(ids_in_page) < self._GDATA_PAGE_SIZE:

        all_ids_count = len(video_ids)
        # Apply --playlist-start / --playlist-end slicing (1-based options).
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
                (username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""
    # NOTE(review): several control-flow lines (if-None guards, try:, while,
    # break/return) appear to be missing from this copy of the file; the
    # dangling except/if lines below belong to elided statements.

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    IE_NAME = u'blip.tv:user'

    def __init__(self, downloader=None):
        # Delegate to the base class; receives an optional FileDownloader.
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
                (self.IE_NAME, username, pagenum))

    def _real_extract(self, url):
        """Collect the user's episode ids via blip.tv's mobile AJAX endpoint."""
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid url: %s' % url)

        username = mobj.group(1)

        # Episode-list endpoint; %s is filled with the numeric users_id below.
        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        request = compat_urllib_request.Request(url)

        # Fetch the profile page first to resolve the numeric user id.
        page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        page_base = page_base % mobj.group(1)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        self.report_download_page(username, pagenum)

        request = compat_urllib_request.Request( page_base + "&page=" + str(pagenum) )

        page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

        # Extract video identifiers
        for mobj in re.finditer(r'href="/([^"]+)"', page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(unescapeHTML(mobj.group(1)))

        video_ids.extend(ids_in_page)

        # A little optimization - if current page is not
        # "full", ie. does not contain PAGE_SIZE video ids then
        # we can assume that this page is the last one - there
        # are no more ids on further pages - no need to query
        if len(ids_in_page) < self._PAGE_SIZE:

        all_ids_count = len(video_ids)
        # Apply --playlist-start / --playlist-end slicing (1-based options).
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
                (self.IE_NAME, username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download([u'http://blip.tv/'+video_id])
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""
    # NOTE(review): several control-flow lines (if-None guards, try:, return)
    # appear to be missing from this copy of the file; the dangling except
    # line below belongs to an elided try statement.

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

    def _real_extract(self, url):
        """Extract the direct file URL and title from a DepositFiles page."""
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        # POSTing gateway_result=1 simulates pressing the free-download button.
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        self.report_download_webpage(file_id)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % compat_str(err))

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                # Surface the site's own restriction message to the user.
                restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
                self._downloader.trouble(u'ERROR: %s' % restriction_message)
            self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        self._downloader.trouble(u'ERROR: unable to extract title')
        file_title = mobj.group(1).decode('utf-8')

        'id': file_id.decode('utf-8'),
        'url': file_url.decode('utf-8'),
        'upload_date': None,
        'title': file_title,
        'ext': file_extension.decode('utf-8'),
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook.

    Optionally logs in (credentials from --username/--password or .netrc),
    downloads the video page, and scrapes metadata plus per-format stream
    URLs out of JavaScript embedded in the page.

    NOTE(review): this excerpt is missing some lines ('try:'/'return'
    statements, dict entries); comments below mark the visible gaps.
    """

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    # Format names as they appear in the page's JS, best quality first.
    _available_formats = ['video', 'highqual', 'lowqual']
    # Maps format name -> file extension (entries not visible in excerpt).
    _video_extensions = {
    IE_NAME = u'facebook'

    def __init__(self, downloader=None):
        # Plain delegation to the base class; no extra state.
        InfoExtractor.__init__(self, downloader)

    def _reporter(self, message):
        """Add header and report message."""
        self._downloader.to_screen(u'[facebook] %s' % message)

    def report_login(self):
        """Report attempt to log in."""
        self._reporter(u'Logging in')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._reporter(u'%s: Downloading video webpage' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._reporter(u'%s: Extracting video information' % video_id)

    def _parse_page(self, video_webpage):
        """Extract video information from page"""
        # Regexes for the metadata fields embedded in the page's JavaScript.
        data = {'title': r'\("video_title", "(.*?)"\)',
            'description': r'<div class="datawrap">(.*?)</div>',
            'owner': r'\("video_owner_name", "(.*?)"\)',
            'thumbnail':  r'\("thumb_url", "(?P<THUMB>.*?)"\)',
        for piece in data.keys():
            mobj = re.search(data[piece], video_webpage)
            if mobj is not None:
                # Values are \uXXXX-escaped inside the JS; unescape, then
                # URL-unquote (assumes Python 2 byte strings).
                video_info[piece] = compat_urllib_parse.unquote_plus(mobj.group(1).decode("unicode_escape"))

        # Collect one stream URL per known format name.
        for fmt in self._available_formats:
            mobj = re.search(r'\("%s_src\", "(.+?)"\)' % fmt, video_webpage)
            if mobj is not None:
                # URL is in a Javascript segment inside an escaped Unicode format within
                # the generally utf-8 page
                video_urls[fmt] = compat_urllib_parse.unquote_plus(mobj.group(1).decode("unicode_escape"))
        video_info['video_urls'] = video_urls

    def _real_initialize(self):
        """Best-effort login before extraction; only warns on failure."""
        if self._downloader is None:

        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            # NOTE(review): a 'try:' line is not visible in this excerpt.
            info = netrc.netrc().authenticators(self._NETRC_MACHINE)
            if info is not None:
                raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                # .netrc problems are non-fatal: warn and continue anonymous.
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))

        # No credentials available: nothing to do.
        if useremail is None:

        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        login_results = compat_urllib_request.urlopen(request).read()
        # A login <form> still present in the response means auth failed.
        if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
            self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))

    def _real_extract(self, url):
        """Download the video page and build per-format info dicts."""
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): the 'if mobj is None:' guard is not visible here.
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group('ID')

        # Get video webpage
        self.report_video_webpage_download(video_id)
        request = compat_urllib_request.Request('https://www.facebook.com/video/video.php?v=%s' % video_id)
        page = compat_urllib_request.urlopen(request)
        video_webpage = page.read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))

        # Start extracting information
        self.report_information_extraction(video_id)

        # Extract information
        video_info = self._parse_page(video_webpage)

        # Uploader ('owner' in the scraped page) is mandatory.
        if 'owner' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = video_info['owner']

        # Title is mandatory.
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = video_info['title']
        video_title = video_title.decode('utf-8')

        # Thumbnail is optional: warn and fall back to empty string.
        if 'thumbnail' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
            video_thumbnail = ''
        video_thumbnail = video_info['thumbnail']

        # Upload date: parse an RFC-2822 date string into YYYYMMDD.
        if 'upload_date' in video_info:
            upload_time = video_info['upload_date']
            timetuple = email.utils.parsedate_tz(upload_time)
            if timetuple is not None:
                # NOTE(review): enclosing 'try:' not visible in this excerpt.
                upload_date = time.strftime('%Y%m%d', timetuple[0:9])

        # Description is optional.
        video_description = video_info.get('description', 'No description available.')

        url_map = video_info['video_urls']

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)
        format_limit = self._downloader.params.get('format_limit', None)

        # Restrict the candidate list to formats at or below the limit.
        if format_limit is not None and format_limit in self._available_formats:
            format_list = self._available_formats[self._available_formats.index(format_limit):]
            format_list = self._available_formats
        existing_formats = [x for x in format_list if x in url_map]
        if len(existing_formats) == 0:
            self._downloader.trouble(u'ERROR: no known formats available for video')
        if req_format is None:
            video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
        elif req_format == 'worst':
            video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
        elif req_format == '-1':
            video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
            if req_format not in url_map:
                self._downloader.trouble(u'ERROR: requested format not available')
            video_url_list = [(req_format, url_map[req_format])] # Specific format

        # One info dict per selected format.
        for format_param, video_real_url in video_url_list:
            # Extension according to format; default mp4.
            video_extension = self._video_extensions.get(format_param, 'mp4')

            # Info dict entries; the surrounding literal is not visible in
            # this excerpt. The .decode() calls assume Python 2 byte strings.
            'id': video_id.decode('utf-8'),
            'url': video_real_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'upload_date': upload_date,
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
            'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
            'thumbnail': video_thumbnail.decode('utf-8'),
            'description': video_description.decode('utf-8'),
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv.

    Requests the page with a JSON skin; if the server answers with a
    'video/*' Content-Type instead, it is treated as a direct download.

    NOTE(review): this excerpt is missing some lines ('try:'/'return',
    parts of dict literals); comments below mark the visible gaps.
    """

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    # Extracts the filename extension from a media URL.
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        """Extract media info either from a direct download or JSON data."""
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): the 'if mobj is None:' guard is not visible here.
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # Ask for the JSON representation of the page ('cchar' is the query
        # separator chosen earlier; not visible in this excerpt).
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        self.report_extraction(mobj.group(1))
        urlh = compat_urllib_request.urlopen(request)
        if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
            basename = url.split('/')[-1]
            title,ext = os.path.splitext(basename)
            # .decode() assumes Python 2 byte strings.
            title = title.decode('UTF-8')
            ext = ext.replace('.', '')
            self.report_direct_download(title)
            # Direct-download info dict (other entries not visible here).
            'upload_date': None,
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))

        if info is None: # Regular URL
            # NOTE(review): enclosing 'try:' not visible in this excerpt.
            json_code_bytes = urlh.read()
            json_code = json_code_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % compat_str(err))

            json_data = json.loads(json_code)
            # Some responses wrap the payload in a 'Post' object.
            if 'Post' in json_data:
                data = json_data['Post']

                # Convert e.g. '12-31-12 11:05PM' to '20121231'.
                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                # NOTE(review): guard for 'umobj is None' not visible here.
                raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                # Info dict entries (surrounding literal not fully visible).
                'id': data['item_id'],
                'uploader': data['display_name'],
                'upload_date': upload_date,
                'title': data['title'],
                'format': data['media']['mimeType'],
                'thumbnail': data['thumbnailUrl'],
                'description': data['description'],
                'player_url': data['embedUrl']
        except (ValueError,KeyError) as err:
            self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))

        # blip.tv serves better streams to the iTunes user agent.
        std_headers['User-Agent'] = 'iTunes/10.6.1'
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de.

    Downloads the watch page, derives the direct .flv media URL from the
    thumbnail <link> element, and scrapes the title from the <title> tag.
    """

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def __init__(self, downloader=None):
        # Plain delegation to the base class; no extra state.
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract video info; returns a one-element list of info dicts.

        Errors are reported through the downloader and None is returned.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # BUG FIX: was 'self._download.trouble(...)' — there is no
            # '_download' attribute, so a bad URL raised AttributeError
            # instead of reporting the error like every other extractor.
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        request = compat_urllib_request.Request('http://www.myvideo.de/watch/%s' % video_id)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        self.report_extraction(video_id)
        # The thumbnail URL embeds the movie's base URL; the stream is that
        # base plus '/<video_id>.flv'.
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
                 webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id':          video_id,
            'url':         video_url,
            'uploader':    None,
            'upload_date': None,
            'title':       video_title,
            'ext':         u'flv',
        }]
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    # NOTE(review): the closing '"""' of this verbose regex is not visible
    # in this excerpt.
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                     |(https?://)?(www\.)?
                         (?P<showname>thedailyshow|colbertnation)\.com/
                        (full-episodes/(?P<episode>.*)|
                          (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                          |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
    IE_NAME = u'comedycentral'

    # Known bitrates, lowest quality last.
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    # Maps bitrate -> extension (entries not visible in this excerpt).
    _video_extensions = {
    # Maps bitrate -> 'WxH' display string (entries not visible here).
    _video_dimensions = {

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL is a verbose (re.VERBOSE) pattern.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def report_extraction(self, episode_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

    def report_config_download(self, episode_id):
        """Report mediagen configuration download."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration' % episode_id)

    def report_index_download(self, episode_id):
        """Report show index (RSS) download."""
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

    def report_player_url(self, episode_id):
        """Report player URL resolution."""
        self._downloader.to_screen(u'[comedycentral] %s: Determining player URL' % episode_id)

    def _print_formats(self, formats):
        """Print the available bitrates with extension and dimensions."""
        print('Available formats:')
        print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))

    def _real_extract(self, url):
        """Resolve the show/clip URL and collect per-item info dicts.

        NOTE(review): several 'try:'/'return'/'else:' lines of this method
        are not visible in this excerpt.
        """
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        # NOTE(review): 'if mobj is None:' guard not visible here.
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # Shortname forms (:tds, :colbert, ...) redirect to the newest
        # full episode of the corresponding show.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
                epTitle = mobj.group('cntitle')
            # Full episode URL: empty episode group means "download newest".
            dlNewest = not mobj.group('episode')
            epTitle = mobj.group('showname')
            epTitle = mobj.group('episode')

        req = compat_urllib_request.Request(url)
        self.report_extraction(epTitle)
        htmlHandle = compat_urllib_request.urlopen(req)
        html = htmlHandle.read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
        # Re-match after any HTTP redirect to the concrete episode URL.
        url = htmlHandle.geturl()
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
        if mobj.group('episode') == '':
            self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
        epTitle = mobj.group('episode')

        # Find the mtvnservices player URLs embedded in the page.
        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', html)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.
            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', html)
            if len(altMovieParams) == 0:
                self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
            mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        # Resolve the player URL through its redirects.
        playerUrl_raw = mMovieParams[0][0]
        self.report_player_url(epTitle)
        urlHandle = compat_urllib_request.urlopen(playerUrl_raw)
        playerUrl = urlHandle.geturl()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to find out player URL: ' + compat_str(err))

        # Fetch the MRSS index listing the episode's media items.
        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        indexXml = compat_urllib_request.urlopen(indexUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download episode index: ' + compat_str(err))

        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for itemEl in itemEls:
            # guid looks like 'mgid:...:<show>.com:<shortMediaId>'.
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            # Fetch the mediagen config with the rendition list.
            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configReq = compat_urllib_request.Request(configUrl)
            self.report_config_download(epTitle)
            configXml = compat_urllib_request.urlopen(configReq).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))

            cdoc = xml.etree.ElementTree.fromstring(configXml)
            # Collect (bitrate, rtmp-url) tuples for each rendition.
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)

                # NOTE(review): presumably inside 'if len(turls) == 0:'.
                self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])

            # For now, just pick the highest bitrate
            format,video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
                    format, video_url = f, v

            # Patch to download from alternative CDN, which does not
            # break on current RTMPDump builds
            broken_cdn = "rtmpe://viacomccstrmfs.fplive.net/viacomccstrm/gsp.comedystor/"
            better_cdn = "rtmpe://cp10740.edgefcs.net/ondemand/mtvnorigin/gsp.comedystor/"

            if video_url.startswith(broken_cdn):
                video_url = video_url.replace(broken_cdn, better_cdn)

            effTitle = showId + u'-' + epTitle
            # Info dict entries (surrounding literal not fully visible).
            'upload_date': officialDate,
            'description': officialTitle,
            'player_url': None #playerUrl

            results.append(info)
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist.

    Scrapes OpenGraph <meta> tags for description/thumbnail/player, then
    downloads the player's JSON-ish config to obtain the media URL.

    NOTE(review): this excerpt is missing some 'try:'/'return' lines and
    parts of the final info dict; comments below mark the gaps.
    """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_extraction(self, showName):
        """Report information extraction."""
        self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)

    def report_config_download(self, showName):
        """Report player configuration download."""
        self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)

    def _real_extract(self, url):
        """Extract the media URL and metadata for one video page."""
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): 'if mobj is None:' guard not visible here.
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        webPage = compat_urllib_request.urlopen(url)
        webPageBytes = webPage.read()
        # Decode using the charset announced in the Content-Type header,
        # falling back to UTF-8.
        m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
        webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: ' + compat_str(err))

        # Scrape metadata out of the page's <meta> tags.
        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
        description = unescapeHTML(descMatch.group(1))
        imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
        imgUrl = unescapeHTML(imgMatch.group(1))
        playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
        playerUrl = unescapeHTML(playerUrlMatch.group(1))
        # The player URL carries the config URL in its query string.
        configUrlMatch = re.search('config=(.*)$', playerUrl)
        configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))

        self.report_config_download(showName)
        configJSON = compat_urllib_request.urlopen(configUrl)
        m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
        configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download configuration: ' + compat_str(err))

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

        config = json.loads(configJSON)
        except (ValueError,) as err:
            self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + compat_str(err))

        playlist = config['playlist']
        # The actual media entry is the second playlist item.
        videoUrl = playlist[1]['url']

        # Info dict entries (surrounding literal not fully visible).
        'uploader': showName,
        'upload_date': None,
        'thumbnail': imgUrl,
        'description': description,
        'player_url': playerUrl,
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com.

    Downloads the moogaloop metadata XML, then the Adobe HDS (f4m)
    manifest, and assembles the fragment URL from the manifest fields.

    NOTE(review): this excerpt is missing some 'try:'/'return' lines and
    parts of the info dict; comments below mark the gaps.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract the media fragment URL and metadata for one video."""
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): 'if mobj is None:' guard not visible here.
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        # Partial 'info' dict initializer (other entries not visible here).
        'upload_date': None,

        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        # NOTE(review): enclosing 'try:' not visible in this excerpt.
        videoNode = mdoc.findall('./video')[0]
        info['description'] = videoNode.findall('./description')[0].text
        info['title'] = videoNode.findall('./caption')[0].text
        info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
        manifest_url = videoNode.findall('./file')[0].text
        # NOTE(review): presumably an 'except IndexError:' handler.
        self._downloader.trouble(u'\nERROR: Invalid metadata XML file')

        # hdcore parameter is required to get the f4m manifest served.
        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))

        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        # f4m elements live in the Adobe f4m XML namespace.
        media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
        node_id = media_node.attrib['url']
        video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError as err:
            self._downloader.trouble(u'\nERROR: Invalid manifest file')

        # Build the HDS fragment URL from the manifest's host and ids.
        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com.

    Scrapes the flv URL, title, and thumbnail out of the video page HTML.

    NOTE(review): this excerpt is missing some 'if mobj is None:' /
    'return' lines; comments below mark the visible gaps.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def report_webpage(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract the media URL, title, and thumbnail for one video."""
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): 'if mobj is None:' guard not visible here.
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group(1)

        self.report_webpage(video_id)

        request = compat_urllib_request.Request(r'http://www.xvideos.com/video' + video_id)
        webpage_bytes = compat_urllib_request.urlopen(request).read()
        # Replace undecodable bytes rather than failing on bad markup.
        webpage = webpage_bytes.decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))

        self.report_extraction(video_id)

        # Extract video URL (percent-encoded in a 'flv_url' query field).
        mobj = re.search(r'flv_url=(.+?)&', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video url')
        video_url = compat_urllib_parse.unquote(mobj.group(1))

        # Extract title from the page <title>, dropping the site suffix.
        mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = mobj.group(1)

        # Extract video thumbnail
        mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
        # group(0) is the whole matched thumbnail URL.
        video_thumbnail = mobj.group(0)

        # Info dict entries (surrounding literal not fully visible).
        'upload_date': None,
        'title': video_title,
        'thumbnail': video_thumbnail,
        'description': None,
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid

       NOTE(review): this excerpt is missing some 'try:'/'return' lines
       and parts of the final info dict; comments below mark the gaps.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        # Plain delegation to the base class; no extra state.
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Report id resolution via the API."""
        self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report stream retrieval."""
        self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Resolve the page URL to an API track and pick the mp3 stream."""
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): 'if mobj is None:' guard not visible here.
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract simple title (uploader + slug of song title)
        slug_title = mobj.group(2)
        simple_title = uploader + u'-' + slug_title

        self.report_resolve('%s/%s' % (uploader, slug_title))

        # Resolve the human-facing page URL to an API track resource.
        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(resolv_url)
        info_json_bytes = compat_urllib_request.urlopen(request).read()
        info_json = info_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction('%s/%s' % (uploader, slug_title))

        # Ask the CDN for the available streams of this track.
        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(streams_url)
        stream_json_bytes = compat_urllib_request.urlopen(request).read()
        stream_json = stream_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))

        streams = json.loads(stream_json)
        # Fixed-quality 128 kbit/s mp3 stream.
        mediaURL = streams['http_mp3_128_url']

        # Info dict entries (surrounding literal not fully visible).
        'uploader': info['user']['username'],
        'upload_date': info['created_at'],
        'title': info['title'],
        'description': info['description'],
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com.

    Builds an rtmpe media URL from a base64-encoded reference embedded in
    the page, and scrapes title/description from the HTML.

    NOTE(review): this excerpt is missing some 'if mobj is None:' /
    'try:' / 'return' lines; comments below mark the visible gaps.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def report_webpage(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract the rtmpe media URL and metadata for one page."""
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): 'if mobj is None:' guard not visible here.
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        self.report_webpage(url)

        request = compat_urllib_request.Request(url)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))

        self.report_extraction(url)

        # Extract the base64-encoded media reference from the page's JS.
        mobj = re.search(r"jsclassref='([^']*)'", webpage)
        self._downloader.trouble(u'ERROR: unable to extract video url')
        # NOTE(review): str.decode('base64') is Python 2-only; this line
        # would need base64.b64decode under Python 3.
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + compat_urllib_parse.unquote(mobj.group(1).decode('base64'))

        # Extract the title assigned in the page's JavaScript.
        mobj = re.search(r'contentTitle = "(.*?)";', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = mobj.group(1).decode('utf-8')

        # Extract description
        video_description = u'No description available.'
        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if mobj is not None:
            video_description = mobj.group(1).decode('utf-8')

        # Derive the video id and extension from the media URL's filename.
        video_filename = video_url.split('/')[-1]
        video_id, extension = video_filename.split('.')

        # Info dict entries (surrounding literal not fully visible).
        'upload_date': None,
        'title': video_title,
        'ext': extension, # Extension is always(?) mp4, but seems to be flv
        'description': video_description,
2911 class MixcloudIE(InfoExtractor):
2912 """Information extractor for www.mixcloud.com"""
2914 _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
2915 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2916 IE_NAME = u'mixcloud'
    def __init__(self, downloader=None):
        # Plain delegation to the base class; no extra state.
        InfoExtractor.__init__(self, downloader)
    def report_download_json(self, file_id):
        """Report JSON download."""
        # NOTE(review): file_id is accepted but not used in the message.
        self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
2925 def report_extraction(self, file_id):
2926 """Report information extraction."""
2927 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2929 def get_urls(self, jsonData, fmt, bitrate='best'):
2930 """Get urls from 'audio_formats' section in json"""
2933 bitrate_list = jsonData[fmt]
2934 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2935 bitrate = max(bitrate_list) # select highest
2937 url_list = jsonData[fmt][bitrate]
2938 except TypeError: # we have no bitrate info.
2939 url_list = jsonData[fmt]
2942 def check_urls(self, url_list):
2943 """Returns 1st active url from list"""
2944 for url in url_list:
2946 compat_urllib_request.urlopen(url)
2948 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2953 def _print_formats(self, formats):
2954 print('Available formats:')
2955 for fmt in formats.keys():
2956 for b in formats[fmt]:
2958 ext = formats[fmt][b][0]
2959 print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
2960 except TypeError: # we have no bitrate info
2961 ext = formats[fmt][0]
2962 print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
2965 def _real_extract(self, url):
2966 mobj = re.match(self._VALID_URL, url)
2968 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2970 # extract uploader & filename from url
2971 uploader = mobj.group(1).decode('utf-8')
2972 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2974 # construct API request
2975 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2976 # retrieve .json file with links to files
2977 request = compat_urllib_request.Request(file_url)
2979 self.report_download_json(file_url)
2980 jsonData = compat_urllib_request.urlopen(request).read()
2981 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2982 self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err))
2986 json_data = json.loads(jsonData)
2987 player_url = json_data['player_swf_url']
2988 formats = dict(json_data['audio_formats'])
2990 req_format = self._downloader.params.get('format', None)
2993 if self._downloader.params.get('listformats', None):
2994 self._print_formats(formats)
2997 if req_format is None or req_format == 'best':
2998 for format_param in formats.keys():
2999 url_list = self.get_urls(formats, format_param)
3001 file_url = self.check_urls(url_list)
3002 if file_url is not None:
3005 if req_format not in formats:
3006 self._downloader.trouble(u'ERROR: format is not available')
3009 url_list = self.get_urls(formats, req_format)
3010 file_url = self.check_urls(url_list)
3011 format_param = req_format
3014 'id': file_id.decode('utf-8'),
3015 'url': file_url.decode('utf-8'),
3016 'uploader': uploader.decode('utf-8'),
3017 'upload_date': None,
3018 'title': json_data['name'],
3019 'ext': file_url.split('.')[-1].decode('utf-8'),
3020 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
3021 'thumbnail': json_data['thumbnail_url'],
3022 'description': json_data['description'],
3023 'player_url': player_url.decode('utf-8'),
# Information extractor for Stanford's Open ClassRoom.
# NOTE(review): listing is elided/whitespace-mangled (leading numbers are
# original file line numbers; `try:`/`if`/`return`/dict-brace lines are
# missing from view).  Code left byte-identical; comments only.
3026 class StanfordOpenClassroomIE(InfoExtractor):
3027 """Information extractor for Stanford's Open ClassRoom"""
# The regex distinguishes three URL shapes: a specific video (both
# 'course' and 'video' groups set), a course page ('course' only), and
# the site root (neither).  _real_extract branches on exactly that.
3029 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
3030 IE_NAME = u'stanfordoc'
3032 def report_download_webpage(self, objid):
3033 """Report information extraction."""
3034 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
3036 def report_extraction(self, video_id):
3037 """Report information extraction."""
3038 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3040 def _real_extract(self, url):
3041 mobj = re.match(self._VALID_URL, url)
3043 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3046 if mobj.group('course') and mobj.group('video'): # A specific video
3047 course = mobj.group('course')
3048 video = mobj.group('video')
3050 'id': course + '_' + video,
3052 'upload_date': None,
3055 self.report_extraction(info['id'])
# Per-video metadata lives in an XML file next to the course's videos dir.
3056 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
3057 xmlUrl = baseUrl + video + '.xml'
3059 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
3060 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3061 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
3063 mdoc = xml.etree.ElementTree.fromstring(metaXml)
3065 info['title'] = mdoc.findall('./title')[0].text
3066 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
3068 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
3070 info['ext'] = info['url'].rpartition('.')[2]
3072 elif mobj.group('course'): # A course page
3073 course = mobj.group('course')
3078 'upload_date': None,
3081 self.report_download_webpage(info['id'])
3083 coursepage = compat_urllib_request.urlopen(url).read()
3084 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3085 self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
3088 m = re.search('<h1>([^<]+)</h1>', coursepage)
3090 info['title'] = unescapeHTML(m.group(1))
3092 info['title'] = info['id']
3094 m = re.search('<description>([^<]+)</description>', coursepage)
3096 info['description'] = unescapeHTML(m.group(1))
# Collect every VideoPage link on the course page as a 'reference' entry,
# then recurse through the normal extraction machinery for each.
3098 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
3101 'type': 'reference',
3102 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
3106 for entry in info['list']:
3107 assert entry['type'] == 'reference'
3108 results += self.extract(entry['url'])
# Root page: enumerate all course pages and recurse into each.
3113 'id': 'Stanford OpenClassroom',
3116 'upload_date': None,
3119 self.report_download_webpage(info['id'])
3120 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
3122 rootpage = compat_urllib_request.urlopen(rootURL).read()
3123 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3124 self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
3127 info['title'] = info['id']
3129 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
3132 'type': 'reference',
3133 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
3138 for entry in info['list']:
3139 assert entry['type'] == 'reference'
3140 results += self.extract(entry['url'])
# Information extractor for MTV.com.
# NOTE(review): listing is elided/whitespace-mangled (leading numbers are
# original file line numbers; guard/`try:`/`return` lines are missing from
# view).  Code left byte-identical; comments only.
3143 class MTVIE(InfoExtractor):
3144 """Information extractor for MTV.com"""
3146 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
3149 def report_webpage(self, video_id):
3150 """Report information extraction."""
3151 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3153 def report_extraction(self, video_id):
3154 """Report information extraction."""
3155 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3157 def _real_extract(self, url):
3158 mobj = re.match(self._VALID_URL, url)
3160 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Normalise scheme-less URLs before fetching.
3162 if not mobj.group('proto'):
3163 url = 'http://' + url
3164 video_id = mobj.group('videoid')
3165 self.report_webpage(video_id)
3167 request = compat_urllib_request.Request(url)
3169 webpage = compat_urllib_request.urlopen(request).read()
3170 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3171 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
# Scrape metadata out of <meta> tags: mtv_vt = song name, mtv_an =
# performer, mtvn_uri + playlist id feed the mediaGen request below.
3174 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
3176 self._downloader.trouble(u'ERROR: unable to extract song name')
3178 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3179 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
3181 self._downloader.trouble(u'ERROR: unable to extract performer')
3183 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3184 video_title = performer + ' - ' + song_name
3186 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
3188 self._downloader.trouble(u'ERROR: unable to mtvn_uri')
3190 mtvn_uri = mobj.group(1)
3192 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
3194 self._downloader.trouble(u'ERROR: unable to extract content id')
3196 content_id = mobj.group(1)
# mediaGen endpoint returns an XML document listing the renditions.
3198 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
3199 self.report_extraction(video_id)
3200 request = compat_urllib_request.Request(videogen_url)
3202 metadataXml = compat_urllib_request.urlopen(request).read()
3203 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3204 self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err))
3207 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
3208 renditions = mdoc.findall('.//rendition')
3210 # For now, always pick the highest quality.
# Assumes renditions are ordered worst-to-best — last entry is best.
3211 rendition = renditions[-1]
# Format string e.g. "mp4-640x360_800" (container-WxH_bitrate).
3214 _,_,ext = rendition.attrib['type'].partition('/')
3215 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
3216 video_url = rendition.find('./src').text
3218 self._downloader.trouble('Invalid rendition field.')
# Result dictionary (braces/remaining keys elided from this view).
3224 'uploader': performer,
3225 'upload_date': None,
3226 'title': video_title,
# Information extractor for v.youku.com.
# NOTE(review): listing is elided/whitespace-mangled (leading numbers are
# original file line numbers; several guard/`try:`/`return`/dict lines are
# missing from view).  Code left byte-identical; comments only.
3234 class YoukuIE(InfoExtractor):
3235 _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
3237 def report_download_webpage(self, file_id):
3238 """Report webpage download."""
3239 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, file_id))
3241 def report_extraction(self, file_id):
3242 """Report information extraction."""
3243 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
# Session id: millisecond timestamp concatenated with two random ints
# (def line for _gen_sid elided from this view).
3246 nowTime = int(time.time() * 1000)
3247 random1 = random.randint(1000,1998)
3248 random2 = random.randint(1000,9999)
3250 return "%d%d%d" %(nowTime,random1,random2)
3252 def _get_file_ID_mix_string(self, seed):
# Deterministic keyed shuffle of the charset: a multiplicative congruential
# step (seed*211+30031 mod 65536) picks and removes one char per round, so
# the same seed always yields the same permutation.
3254 source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
3256 for i in range(len(source)):
3257 seed = (seed * 211 + 30031 ) % 65536
3258 index = math.floor(seed / 65536 * len(source) )
3259 mixed.append(source[int(index)])
3260 source.remove(source[int(index)])
3261 #return ''.join(mixed)
3264 def _get_file_id(self, fileId, seed):
# Decode the server-obfuscated file id: each '*'-separated number is an
# index into the seed-shuffled charset.
3265 mixed = self._get_file_ID_mix_string(seed)
3266 ids = fileId.split('*')
3270 realId.append(mixed[int(ch)])
3271 return ''.join(realId)
3273 def _real_extract(self, url):
3274 mobj = re.match(self._VALID_URL, url)
3276 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3278 video_id = mobj.group('ID')
3280 info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
3282 request = compat_urllib_request.Request(info_url, None, std_headers)
3284 self.report_download_webpage(video_id)
3285 jsondata = compat_urllib_request.urlopen(request).read()
3286 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3287 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
3290 self.report_extraction(video_id)
3292 jsonstr = jsondata.decode('utf-8')
3293 config = json.loads(jsonstr)
3295 video_title = config['data'][0]['title']
3296 seed = config['data'][0]['seed']
# Format selection: 'best' prefers hd2 when offered; 'worst' and explicit
# formats handled on lines elided from this view.
3298 format = self._downloader.params.get('format', None)
3299 supported_format = list(config['data'][0]['streamfileids'].keys())
3301 if format is None or format == 'best':
3302 if 'hd2' in supported_format:
3307 elif format == 'worst':
3315 fileid = config['data'][0]['streamfileids'][format]
3316 keys = [s['k'] for s in config['data'][0]['segs'][format]]
3317 except (UnicodeDecodeError, ValueError, KeyError):
3318 self._downloader.trouble(u'ERROR: unable to extract info section')
3322 sid = self._gen_sid()
3323 fileid = self._get_file_id(fileid, seed)
3325 #column 8,9 of fileid represent the segment number
3326 #fileid[7:9] should be changed
# One download url per segment: splice the two-hex-digit segment index
# into fileid positions 8-9 and pair it with that segment's key.
3327 for index, key in enumerate(keys):
3329 temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
3330 download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
3333 'id': '%s_part%02d' % (video_id, index),
3334 'url': download_url,
3336 'upload_date': None,
3337 'title': video_title,
3340 files_info.append(info)
# Information extractor for xnxx.com.
# NOTE(review): listing is elided/whitespace-mangled (leading numbers are
# original file line numbers; guard/`try:`/`return` lines are missing from
# view).  Code left byte-identical; comments only.
3345 class XNXXIE(InfoExtractor):
3346 """Information extractor for xnxx.com"""
3348 _VALID_URL = r'^http://video\.xnxx\.com/video([0-9]+)/(.*)'
# Page-scrape patterns: flash video url, page title, thumbnail url.
3350 VIDEO_URL_RE = r'flv_url=(.*?)&'
3351 VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
3352 VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'
3354 def report_webpage(self, video_id):
3355 """Report information extraction"""
3356 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3358 def report_extraction(self, video_id):
3359 """Report information extraction"""
3360 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3362 def _real_extract(self, url):
3363 mobj = re.match(self._VALID_URL, url)
3365 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3367 video_id = mobj.group(1)
3369 self.report_webpage(video_id)
3371 # Get webpage content
3373 webpage_bytes = compat_urllib_request.urlopen(url).read()
3374 webpage = webpage_bytes.decode('utf-8')
3375 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3376 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
# The video url is percent-encoded inside the page's flashvars.
3379 result = re.search(self.VIDEO_URL_RE, webpage)
3381 self._downloader.trouble(u'ERROR: unable to extract video url')
3383 video_url = compat_urllib_parse.unquote(result.group(1))
3385 result = re.search(self.VIDEO_TITLE_RE, webpage)
3387 self._downloader.trouble(u'ERROR: unable to extract video title')
3389 video_title = result.group(1)
3391 result = re.search(self.VIDEO_THUMB_RE, webpage)
3393 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3395 video_thumbnail = result.group(1)
# Result dictionary (braces/remaining keys elided from this view).
3401 'upload_date': None,
3402 'title': video_title,
3404 'thumbnail': video_thumbnail,
3405 'description': None,
# Information extractor for plus.google.com posts containing a video.
# NOTE(review): listing is elided/whitespace-mangled (leading numbers are
# original file line numbers; guard/`try:`/`return` lines are missing from
# view).  Code left byte-identical; comments only.
3409 class GooglePlusIE(InfoExtractor):
3410 """Information extractor for plus.google.com."""
3412 _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
3413 IE_NAME = u'plus.google'
3415 def __init__(self, downloader=None):
3416 InfoExtractor.__init__(self, downloader)
3418 def report_extract_entry(self, url):
3419 """Report downloading extry"""
3420 self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)
3422 def report_date(self, upload_date):
3423 """Report downloading extry"""
3424 self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)
3426 def report_uploader(self, uploader):
3427 """Report downloading extry"""
3428 self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)
3430 def report_title(self, video_title):
3431 """Report downloading extry"""
3432 self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)
3434 def report_extract_vid_page(self, video_page):
3435 """Report information extraction."""
3436 self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)
3438 def _real_extract(self, url):
3439 # Extract id from URL
3440 mobj = re.match(self._VALID_URL, url)
3442 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
3445 post_url = mobj.group(0)
3446 video_id = mobj.group(1)
3448 video_extension = 'flv'
3450 # Step 1, Retrieve post webpage to extract further information
3451 self.report_extract_entry(post_url)
3452 request = compat_urllib_request.Request(post_url)
3454 webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3455 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3456 self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))
3459 # Extract update date
3461 pattern = 'title="Timestamp">(.*?)</a>'
3462 pattern matched against the post page below.
3462 mobj = re.search(pattern, webpage)
3464 upload_date = mobj.group(1)
3465 # Convert timestring to a format suitable for filename
3466 upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3467 upload_date = upload_date.strftime('%Y%m%d')
3468 self.report_date(upload_date)
# Extract uploader from the rel="author" anchor.
3472 pattern = r'rel\="author".*?>(.*?)</a>'
3473 mobj = re.search(pattern, webpage)
3475 uploader = mobj.group(1)
3476 self.report_uploader(uploader)
3479 # Get the first line for title
3481 pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3482 mobj = re.search(pattern, webpage)
3484 video_title = mobj.group(1)
3485 self.report_title(video_title)
3487 # Step 2, Stimulate clicking the image box to launch video
3488 pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3489 mobj = re.search(pattern, webpage)
3491 self._downloader.trouble(u'ERROR: unable to extract video page URL')
3493 video_page = mobj.group(1)
3494 request = compat_urllib_request.Request(video_page)
3496 webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3497 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3498 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
3500 self.report_extract_vid_page(video_page)
3503 # Extract video links on video page
3504 """Extract video links of all sizes"""
3505 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3506 mobj = re.findall(pattern, webpage)
3508 self._downloader.trouble(u'ERROR: unable to extract video links')
3510 # Sort in resolution
3511 links = sorted(mobj)
3513 # Choose the lowest of the sort, i.e. highest resolution
3514 video_url = links[-1]
3515 # Only get the url. The resolution part in the tuple has no use anymore
3516 video_url = video_url[-1]
3517 # Treat escaped \u0026 style hex
3519 video_url = video_url.decode("unicode_escape")
3520 except AttributeError: # Python 3
3521 video_url = bytes(video_url, 'ascii').decode('unicode-escape')
# Result dictionary (braces/remaining keys elided from this view).
3527 'uploader': uploader,
3528 'upload_date': upload_date,
3529 'title': video_title,
3530 'ext': video_extension,
# Information extractor for nba.com video pages.
# NOTE(review): listing is elided/whitespace-mangled (leading numbers are
# original file line numbers; guard/`try:`/`return` lines are missing from
# view).  Code left byte-identical; comments only.
3533 class NBAIE(InfoExtractor):
3534 _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
3537 def report_extraction(self, video_id):
3538 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3540 def _real_extract(self, url):
3541 mobj = re.match(self._VALID_URL, url)
3543 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3546 video_id = mobj.group(1)
3547 if video_id.endswith('/index.html'):
3548 video_id = video_id[:-len('/index.html')]
3550 self.report_extraction(video_id)
3552 urlh = compat_urllib_request.urlopen(url)
3553 webpage_bytes = urlh.read()
3554 webpage = webpage_bytes.decode('utf-8', 'ignore')
3555 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3556 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
# Media url is derived from the page path on Turner's CDN, not scraped.
3559 video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
# Helper: first regex group from the page, unescaped, else `default`.
3560 def _findProp(rexp, default=None):
3561 m = re.search(rexp, webpage)
3563 return unescapeHTML(m.group(1))
3567 shortened_video_id = video_id.rpartition('/')[2]
3568 title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
3570 'id': shortened_video_id,
# NOTE(review): 'uploader_date' looks like a typo for the documented field
# name 'upload_date' — confirm against the FileDownloader before changing.
3574 'uploader_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
3575 'description': _findProp(r'<div class="description">(.*?)</h1>'),
# Information extractor for justin.tv / twitch.tv via the Justin.tv API.
# NOTE(review): listing is elided/whitespace-mangled (leading numbers are
# original file line numbers; guard/`try:`/`return` lines are missing from
# view).  Code left byte-identical; comments only.
3579 class JustinTVIE(InfoExtractor):
3580 """Information extractor for justin.tv and twitch.tv"""
3581 # TODO: One broadcast may be split into multiple videos. The key
3582 # 'broadcast_id' is the same for all parts, and 'broadcast_part'
3583 # starts at 1 and increases. Can we treat all parts as one video?
3585 _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
3586 ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
# API page size used for channel-archive pagination below.
3587 _JUSTIN_PAGE_LIMIT = 100
3588 IE_NAME = u'justin.tv'
3590 def report_extraction(self, file_id):
3591 """Report information extraction."""
3592 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3594 def report_download_page(self, channel, offset):
3595 """Report attempt to download a single page of videos."""
3596 self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
3597 (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
3599 # Return count of items, list of *valid* items
3600 def _parse_page(self, url):
# Fetch one API page of clip JSON and convert each clip to an info dict.
3602 urlh = compat_urllib_request.urlopen(url)
3603 webpage_bytes = urlh.read()
3604 webpage = webpage_bytes.decode('utf-8', 'ignore')
3605 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3606 self._downloader.trouble(u'ERROR: unable to download video info JSON: %s' % compat_str(err))
3609 response = json.loads(webpage)
3611 for clip in response:
3612 video_url = clip['video_file_url']
3614 video_extension = os.path.splitext(video_url)[1][1:]
# created_on 'YYYY-MM-DD...' -> 'YYYYMMDD'.
3615 video_date = re.sub('-', '', clip['created_on'][:10])
3619 'title': clip['title'],
3620 'uploader': clip.get('user_id', clip.get('channel_id')),
3621 'upload_date': video_date,
3622 'ext': video_extension,
# Returns raw item count alongside the parsed list so the caller can
# detect a short (final) page.
3624 return (len(response), info)
3626 def _real_extract(self, url):
3627 mobj = re.match(self._VALID_URL, url)
3629 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Group 1 = channel name, group 2 = broadcast id; lastindex tells which
# API endpoint applies (channel archives vs single clip).
3632 api = 'http://api.justin.tv'
3633 video_id = mobj.group(mobj.lastindex)
3635 if mobj.lastindex == 1:
3637 api += '/channel/archives/%s.json'
3639 api += '/clip/show/%s.json'
3640 api = api % (video_id,)
3642 self.report_extraction(video_id)
# Walk pages of `limit` items until a short page signals the end.
3646 limit = self._JUSTIN_PAGE_LIMIT
3649 self.report_download_page(video_id, offset)
3650 page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
3651 page_count, page_info = self._parse_page(page_url)
3652 info.extend(page_info)
3653 if not paged or page_count != limit:
# Information extractor for funnyordie.com.
# NOTE(review): listing is elided/whitespace-mangled (leading numbers are
# original file line numbers; guard/`try:`/`return` lines are missing from
# view).  Code left byte-identical; comments only.
3658 class FunnyOrDieIE(InfoExtractor):
3659 _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
3661 def report_extraction(self, video_id):
3662 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3664 def _real_extract(self, url):
3665 mobj = re.match(self._VALID_URL, url)
3667 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3670 video_id = mobj.group('id')
3671 self.report_extraction(video_id)
3673 urlh = compat_urllib_request.urlopen(url)
3674 webpage_bytes = urlh.read()
3675 webpage = webpage_bytes.decode('utf-8', 'ignore')
3676 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3677 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
# Media url is in the second <source> of the page's <video> element.
3680 m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
3682 self._downloader.trouble(u'ERROR: unable to find video information')
3683 video_url = unescapeHTML(m.group('url'))
3685 m = re.search(r"class='player_page_h1'>\s+<a.*?>(?P<title>.*?)</a>", webpage)
3687 self._downloader.trouble(u'Cannot find video title')
3688 title = unescapeHTML(m.group('title'))
3690 m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
3692 desc = unescapeHTML(m.group('desc'))
# Result dictionary (braces/remaining keys elided from this view).
3701 'description': desc,
# Information extractor for tweetreel.com.
# NOTE(review): listing is elided/whitespace-mangled (leading numbers are
# original file line numbers; guard/`try:`/`return` lines are missing from
# view).  Code left byte-identical; comments only.
3705 class TweetReelIE(InfoExtractor):
3706 _VALID_URL = r'^(?:https?://)?(?:www\.)?tweetreel\.com/[?](?P<id>[0-9a-z]+)$'
3708 def report_extraction(self, video_id):
3709 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3711 def _real_extract(self, url):
3712 mobj = re.match(self._VALID_URL, url)
3714 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3717 video_id = mobj.group('id')
3718 self.report_extraction(video_id)
3720 urlh = compat_urllib_request.urlopen(url)
3721 webpage_bytes = urlh.read()
3722 webpage = webpage_bytes.decode('utf-8', 'ignore')
3723 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3724 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
# The twitter status id doubles as the media filename (see download url).
3727 m = re.search(r'<div id="left" status_id="([0-9]+)">', webpage)
3729 self._downloader.trouble(u'ERROR: Cannot find status ID')
3730 status_id = m.group(1)
# Description = tweet text with anchor tags stripped.
3732 m = re.search(r'<div class="tweet_text">(.*?)</div>', webpage, flags=re.DOTALL)
3734 self._downloader.trouble(u'WARNING: Cannot find description')
3735 desc = unescapeHTML(re.sub('<a.*?</a>', '', m.group(1))).strip()
3737 m = re.search(r'<div class="tweet_info">.*?from <a target="_blank" href="https?://twitter.com/(?P<uploader_id>.+?)">(?P<uploader>.+?)</a>', webpage, flags=re.DOTALL)
3739 self._downloader.trouble(u'ERROR: Cannot find uploader')
3740 uploader = unescapeHTML(m.group('uploader'))
3741 uploader_id = unescapeHTML(m.group('uploader_id'))
# Unix timestamp from the page -> 'YYYYMMDD'.
3743 m = re.search(r'<span unixtime="([0-9]+)"', webpage)
3745 self._downloader.trouble(u'ERROR: Cannot find upload date')
3746 upload_date = datetime.datetime.fromtimestamp(int(m.group(1))).strftime('%Y%m%d')
3749 video_url = 'http://files.tweetreel.com/video/' + status_id + '.mov'
# Result dictionary (braces/remaining keys elided from this view).
3756 'description': desc,
3757 'uploader': uploader,
3758 'uploader_id': uploader_id,
3759 'internal_id': status_id,
3760 'upload_date': upload_date
# Information extractor for store.steampowered.com game trailer pages.
# NOTE(review): listing is elided/whitespace-mangled (leading numbers are
# original file line numbers; guard/`try:`/`return` lines are missing from
# view).  Code left byte-identical; comments only.
3764 class SteamIE(InfoExtractor):
3765 _VALID_URL = r"""http://store.steampowered.com/
3766 (?P<urltype>video|app)/ #If the page is only for videos or for a game
3768 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
# NOTE(review): _real_extract reads m.group('gameID'), but no gameID group
# is visible in the elided regex above — presumably defined on a missing
# line; confirm against the full source.
3771 def suitable(self, url):
3772 """Receives a URL and returns True if suitable for this IE."""
# Overrides the base class because _VALID_URL needs the VERBOSE flag.
3773 return re.match(self._VALID_URL, url, re.VERBOSE) is not None
3775 def report_download_video_page(self, game_id):
3776 self._downloader.to_screen(u'[%s] %s: Downloading video page' % (self.IE_NAME, game_id))
3778 def _real_extract(self, url):
3779 m = re.match(self._VALID_URL, url, re.VERBOSE)
# Pattern for the page's per-movie javascript literals.
3780 urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
3781 gameID = m.group('gameID')
3782 videourl = 'http://store.steampowered.com/video/%s/' % gameID
3784 self.report_download_video_page(gameID)
3785 urlh = compat_urllib_request.urlopen(videourl)
3786 webpage_bytes = urlh.read()
3787 webpage = webpage_bytes.decode('utf-8', 'ignore')
3788 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3789 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
# Pair each movie entry with the i-th <span class="title"> on the page.
3791 mweb = re.finditer(urlRE, webpage)
3792 namesRE = r'<span class=\"title\">(?P<videoName>[\w:/\.\?=\+\s-]+)</span>'
3793 titles = list(re.finditer(namesRE, webpage))
3797 video_id = vid.group('videoID')
3798 title = titles[i].group('videoName')
3799 video_url=vid.group('videoURL')
3801 self._downloader.trouble(u'ERROR: Cannot find video url for %s' % video_id)
# Information extractor for ustream.tv recorded videos.
# NOTE(review): listing is elided/whitespace-mangled (leading numbers are
# original file line numbers; the result-dict braces and `return` are
# missing from view).  Code left byte-identical; comments only.
3812 class UstreamIE(InfoExtractor):
3813 _VALID_URL = r'http://www.ustream.tv/recorded/(?P<videoID>\d+)'
3814 IE_NAME = u'ustream'
3816 def _real_extract(self, url):
3817 m = re.match(self._VALID_URL, url)
3818 video_id = m.group('videoID')
# Media url is derived directly from the id on ustream's CDN.
3819 video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
3820 webpage = self._download_webpage(url, video_id)
3821 m = re.search(r'data-title="(?P<title>.+)"',webpage)
3822 title = m.group('title')
# Channel id from the page's state anchor serves as the uploader.
3823 m = re.search(r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',webpage)
3824 uploader = m.group('uploader')
3830 'uploader': uploader
3835 def gen_extractors():
3836 """ Return a list of an instance of every supported extractor.
3837 The order does matter; the first extractor matched is the one handling the URL.
3840 YoutubePlaylistIE(),
3864 StanfordOpenClassroomIE(),