2 # -*- coding: utf-8 -*-
4 from __future__ import absolute_import
15 import xml.etree.ElementTree
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    title:          Video title, unescaped.
    ext:            Video filename extension.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The .srt file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    The fields should all be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    # Lazily-initialized state: _real_initialize() runs at most once.
    _ready = False
    _downloader = None
    # Subclasses set this to False for known-broken extractors.
    _WORKING = True

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url) is not None

    def working(self):
        """Getter method for _WORKING."""
        return self._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        # Run the (possibly expensive) real initialization only once.
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @property
    def IE_NAME(self):
        # Derive the IE name from the class name, dropping the "IE" suffix
        # (e.g. YoutubeIE -> Youtube).  Subclasses may override with a constant.
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the response handle """
        if note is None:
            note = u'Downloading video webpage'
        self._downloader.to_screen(u'[%s] %s: %s' % (self.IE_NAME, video_id, note))
        try:
            return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is None:
                errnote = u'Unable to download webpage'
            # Re-raise as ExtractorError, preserving the original traceback.
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the data of the page as a string """
        urlh = self._request_webpage(url_or_request, video_id, note, errnote)
        webpage_bytes = urlh.read()
        # 'replace' keeps extraction going even on pages with invalid UTF-8.
        return webpage_bytes.decode('utf-8', 'replace')
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # NOTE(review): this excerpt omits many original lines (string delimiters,
    # `try:` / `if` / `return` statements, dict closings).  Code tokens below are
    # kept as-is; comments document intent only — confirm against full source.

    # Verbose-mode URL pattern; the r'''^ ... $''' delimiters are not visible here.
                     (?:https?://)?                                       # http(s):// (optional)
                     (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
                        tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
                     (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                     (?:                                                  # the various things that can precede the ID:
                         (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                         |(?:                                             # or the v= param in all its forms
                             (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                             (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                             (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                         )?                                               # optional -> youtube.com/xxxx is OK
                     )?                                                   # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]+)                                     # here is it! the YouTube video ID
                     (?(1).+)?                                            # if we found the ID, everything can follow

    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # Maps itag -> container/extension; most entries are not visible in this excerpt.
    _video_extensions = {
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
    # Maps itag -> "WxH" display string; entries are not visible in this excerpt.
    _video_dimensions = {

    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Playlist URLs also match the video regex, so defer to the playlist IE.
        if YoutubePlaylistIE.suitable(url): return False
        # _VALID_URL is written in verbose mode, hence re.VERBOSE here.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to download video subtitles."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report extracted video URL."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _closed_captions_xml_to_srt(self, xml_string):
        """Convert YouTube's closed-caption XML into SRT-formatted text."""
        # NOTE(review): the `srt` accumulator initialisation, the float()
        # conversion of `start`, and the final return are not visible here.
        texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
        # TODO parse xml instead of regex
        for n, (start, dur_tag, dur, caption) in enumerate(texts):
            if not dur: dur = '4'  # default caption duration when none is given
            end = start + float(dur)
            # Render start/end as SRT timestamps: HH:MM:SS,mmm
            start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
            end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
            caption = unescapeHTML(caption)
            caption = unescapeHTML(caption) # double cycle, intentional
            srt += str(n+1) + '\n'
            srt += start + ' --> ' + end + '\n'
            srt += caption + '\n\n'

    def _extract_subtitles(self, video_id):
        """Fetch a caption track and return a (warning_message, srt_contents) pair."""
        self.report_video_subtitles_download(video_id)
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
            srt_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
        # Build a lang_code -> track-name mapping from the track list XML.
        srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
        srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
        if not srt_lang_list:
            return (u'WARNING: video has no closed captions', None)
        # Language selection: explicit --sub-lang wins, then English, then
        # whichever track comes first (intermediate branches not visible here).
        if self._downloader.params.get('subtitleslang', False):
            srt_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in srt_lang_list:
            srt_lang = list(srt_lang_list.keys())[0]
        if not srt_lang in srt_lang_list:
            return (u'WARNING: no closed captions found in the specified language', None)
        params = compat_urllib_parse.urlencode({
            'name': srt_lang_list[srt_lang].encode('utf-8'),
        url = 'http://www.youtube.com/api/timedtext?' + params
            srt_xml = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
            return (u'WARNING: Did not fetch video subtitles', None)
        return (None, self._closed_captions_xml_to_srt(srt_xml))

    def _print_formats(self, formats):
        """Print each available itag with its extension and dimensions."""
        print('Available formats:')
        # NOTE(review): the `for x in formats:` line is not visible in this excerpt.
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        """Set language, optionally log in, and confirm age on youtube.com."""
        if self._downloader is None:

        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))

        # Set language (best-effort: failure only warns)
        request = compat_urllib_request.Request(self._LANG_URL)
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % compat_str(err))

        # No authentication to be performed

        request = compat_urllib_request.Request(self._LOGIN_URL)
            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to fetch login page: %s' % compat_str(err))

        # Pull the GALX and dsh hidden-form tokens Google's login form requires.
        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
            galx = match.group(1)

        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)

        # Hidden fields of Google's ServiceLogin form (several entries and the
        # surrounding dict literal are not visible in this excerpt).
                u'continue': u'http://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
                u'PersistentCookie': u'yes',
                u'bgresponse': u'js_disabled',
                u'checkConnection': u'',
                u'checkedDomains': u'youtube',
                u'signIn': u'Sign in',
                u'service': u'youtube',

        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # does not handle unicode values properly.
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # If the login form is still present in the response, login failed.
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))

        # Confirm age (form dict literal partially missing from this excerpt)
                'action_confirm': 'Confirm',
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
            self.report_age_confirmation()
            age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))

    def _extract_id(self, url):
        """Return the YouTube video ID (group 2 of _VALID_URL) for a URL."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group(2)

    def _real_extract(self, url):
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
            url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
            video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
            # Un-escape the JS-escaped URL (\\/ -> /).
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))

        # Get video info: try several &el= variants until one returns a token.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = compat_urllib_request.Request(video_info_url)
                video_info_webpage_bytes = compat_urllib_request.urlopen(request).read()
                video_info_webpage = video_info_webpage_bytes.decode('utf-8', 'ignore')
                video_info = compat_parse_qs(video_info_webpage)
                if 'token' in video_info:
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0])
                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.trouble(u'ERROR: "rental" videos not supported')

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader name')
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id (best-effort: only a warning on failure)
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
            video_uploader_id = mobj.group(1)
            self._downloader.trouble(u'WARNING: unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail image
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date: normalise separators then try several date formats
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
            video_description = ''

        # closed captions
        video_subtitles = None
        if self._downloader.params.get('writesubtitles', False):
            (srt_error, video_subtitles) = self._extract_subtitles(video_id)
                self._downloader.trouble(srt_error)

        if 'length_seconds' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video duration')
        video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
            # itag -> playable URL (signature appended from the 'sig' field)
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                        video_url_list = [(rf, url_map[rf])]
                if video_url_list is None:
                    self._downloader.trouble(u'ERROR: requested format not available')
            self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')

        # Build one result dict per selected format.
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

                'url': video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date': upload_date,
                'title': video_title,
                'ext': video_extension,
                'format': video_format,
                'thumbnail': video_thumbnail,
                'description': video_description,
                'player_url': player_url,
                'subtitles': video_subtitles,
                'duration': video_duration
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    # NOTE(review): this excerpt omits several original lines (`try:` lines,
    # `if mobj is None:` guards, early returns, dict/list closings).  Code
    # tokens below are kept as-is; comments document intent only.

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        InfoExtractor.__init__(self, downloader)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        """Fetch the family-filter disclaimer page and confirm age."""
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
            self.report_disclaimer()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err))

        # Confirm age by POSTing the filter form (dict literal partially missing here)
            'submit': "Continue - I'm over 18",
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
            self.report_age_confirmation()
            disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))

    def _real_extract(self, url):
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            # Delegate yt-prefixed IDs to the YouTube extractor via the downloader.
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        # First try the direct &mediaURL= form; otherwise fall back to flashvars.
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
                self._downloader.trouble(u'ERROR: unable to extract media URL')
            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
            mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
                self._downloader.trouble(u'ERROR: unable to extract media URL')
            # Un-escape JSON-style slashes in the media URL.
            mediaURL = mobj.group(1).replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'submitter=(.*?);', webpage)
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = mobj.group(1)

        # Result dict (the surrounding `return [{ ... }]` is partially missing here)
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    # NOTE(review): this excerpt omits several original lines (`if mobj is
    # None:` guards, try/else branches, the final return dict opening).
    # Code tokens below are kept as-is; comments document intent only.

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # Strip the slug/query portion: keep only the bare video id.
        video_id = mobj.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        # Disable the family filter so age-restricted pages still resolve.
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'\s*var flashvars = (.*)', webpage)
            self._downloader.trouble(u'ERROR: unable to extract media URL')
        flashvars = compat_urllib_parse.unquote(mobj.group(1))

        # Probe qualities from best to worst; the first key present wins.
        for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
                self._downloader.to_screen(u'[dailymotion] Using %s' % key)
            self._downloader.trouble(u'ERROR: unable to extract video URL')

        mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
            self._downloader.trouble(u'ERROR: unable to extract video URL')

        video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = unescapeHTML(mobj.group('title'))

        video_uploader = None
        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
            # lookin for official user
            mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if mobj_official is None:
                self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
                video_uploader = mobj_official.group(1)
            video_uploader = mobj.group(1)

        video_upload_date = None
        mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
            # Page shows DD-MM-YYYY; reassemble as YYYYMMDD.
            video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)

        # Result dict (the surrounding `return [{ ... }]` is partially missing here)
            'uploader': video_uploader,
            'upload_date': video_upload_date,
            'title': video_title,
            'ext': video_extension,
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    # NOTE(review): this excerpt omits several original lines (`try:` lines,
    # `if mobj is None:` guards, the return dict opening).  Code tokens below
    # are kept as-is; comments document intent only.

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract media URL')
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))

        # Title and uploader come from the same <title> tag.
        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        video_uploader = mobj.group(2).decode('utf-8')

        # Result dict (the surrounding `return [{ ... }]` is partially missing here)
            'id': video_id.decode('utf-8'),
            'url': video_url.decode('utf-8'),
            'uploader': video_uploader,
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    # NOTE(review): this excerpt omits several original lines (`try:` lines,
    # `if mobj is None:` guards, the return dict opening).  Code tokens below
    # are kept as-is; comments document intent only.

    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = compat_urllib_request.Request(url)
                webpage = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
                self._downloader.trouble(u'ERROR: Unable to extract id field')
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
                self._downloader.trouble(u'ERROR: Unable to extract vid field')
            yahoo_vid = mobj.group(1)

            # Recurse once with the canonical /watch/ URL.
            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video uploader')
        video_uploader = mobj.group(1).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video description')
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video height')
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video width')
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
        yv_bitrate = '700' # according to Wikipedia this is hard-coded
        request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
            self._downloader.trouble(u'ERROR: Unable to extract media URL')
        video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        video_url = unescapeHTML(video_url)

        # Result dict (the surrounding `return [{ ... }]` is partially missing here)
            'id': video_id.decode('utf-8'),
            'uploader': video_uploader,
            'upload_date': None,
            'title': video_title,
            'ext': video_extension.decode('utf-8'),
            'thumbnail': video_thumbnail.decode('utf-8'),
            'description': video_description,
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com.

    Downloads the video page, parses the embedded ``{config: ...}`` JSON for
    metadata (title, uploader, thumbnail, upload date), then builds the
    play_redirect URL from the request signature/timestamp plus the best
    available codec/quality combination.

    NOTE(review): this excerpt appears to have lines elided (``try:`` headers,
    ``if mobj is None:`` guards, ``return`` statements); comments below mark
    the suspected gaps — confirm against the full file.
    """

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): `if mobj is None:` guard (and its `return`) appear elided
        # before this error path
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        video_id = mobj.group('id')
        # Normalize: force https, and map the player redirect form back to the
        # canonical /<id> page
        if not mobj.group('proto'):
            url = 'https://' + url
        if mobj.group('direct_link'):
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        # NOTE(review): `try:` header appears elided before the download below
        self.report_download_webpage(video_id)
        webpage_bytes = compat_urllib_request.urlopen(request).read()
        webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON
        # NOTE(review): a `try:` header appears elided around the config parse,
        # pairing with the `unable to extract info section` error below
        config = webpage.split(' = {config:')[1].split(',assets:')[0]
        config = json.loads(config)
        self._downloader.trouble(u'ERROR: unable to extract info section')

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description: video_description = clean_html(video_description)
        else: video_description = ''

        # Extract upload date
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            # Collapse YYYY-MM-DD into the YYYYMMDD form used for upload_date
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                # NOTE(review): an `else:` line appears elided before this append
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        # Pick the first non-empty bucket, preferring hd, then sd, then other
        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
        # NOTE(review): a `break` and the loop's `else:` branch appear elided
        # before this no-codec error path
        self._downloader.trouble(u'ERROR: no known codec found')

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
            %(video_id, sig, timestamp, video_quality, video_codec.upper())

        # NOTE(review): the `return [{` wrapper (and the 'id'/'url' entries)
        # appear elided around this info dict
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date': video_upload_date,
            'title': video_title,
            'ext': video_extension,
            'thumbnail': video_thumbnail,
            'description': video_description,
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor.

    Handles both live-stream pages (URLs matching _LIVE_URL) and "+7"
    catch-up pages. Metadata is scraped via grep_webpage(), a helper that
    fetches a page, applies one regex, and maps the listed groups into a
    dict keyed by the names given in matchTuples.

    NOTE(review): this excerpt appears to have lines elided (``try:``
    headers, ``return`` statements, several call arguments and closing
    brackets); comments below mark the suspected gaps — confirm against
    the full file.
    """

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    # Live pages end in index-<n>.html
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)

    def fetch_webpage(self, url):
        # Download *url* and return its raw body
        request = compat_urllib_request.Request(url)
        # NOTE(review): `try:` header appears elided before the download below
        self.report_download_webpage(url)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
        except ValueError as err:
            # urlopen raises ValueError for syntactically invalid URLs
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
        # NOTE(review): `return webpage` appears elided here

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        # Fetch *url*, apply *regex* with *regexFlags*, and collect the
        # groups named in *matchTuples* (group index, key, error message)
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        # NOTE(review): the `info = {}` initialisation and the
        # `if mobj is None:` guard appear elided before this error path
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                self._downloader.trouble(err)
            # NOTE(review): an `else:` line appears elided before this assignment
                info[key] = mobj.group(i)
        # NOTE(review): `return info` appears elided here

    def extractLiveStream(self, url):
        # Language code is the 4th path component from the end on live URLs
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            # NOTE(review): the leading `url,` argument appears elided
            r'src="(.*?/videothek_js.*?\.js)',
            # NOTE(review): the regex-flags argument appears elided
            # NOTE(review): an opening `[` appears elided before this tuple
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            # NOTE(review): the closing `])` appears elided
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            # NOTE(review): the `next_url,` argument appears elided
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
            # NOTE(review): a third pattern fragment and the flags argument
            # appear elided, as does the opening `[`
                (1, 'path', u'ERROR: could not extract video path: %s' % url),
                (2, 'player', u'ERROR: could not extract video player: %s' % url),
                (3, 'url', u'ERROR: could not extract video url: %s' % url)
            # NOTE(review): the closing `])` appears elided
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))
        # NOTE(review): the downloader hand-off for the live stream appears elided

    def extractPlus7Stream(self, url):
        # Language code is the 3rd path component from the end on +7 URLs
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            # NOTE(review): the `url,` argument appears elided
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            # NOTE(review): flags argument and opening `[` appear elided
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            # NOTE(review): the closing `])` appears elided
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            # NOTE(review): the `next_url,` argument appears elided
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            # NOTE(review): flags argument and opening `[` appear elided
                (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
            # NOTE(review): the closing `])` appears elided
        next_url = compat_urllib_parse.unquote(info.get('url'))

        info = self.grep_webpage(
            # NOTE(review): the `next_url,` argument appears elided
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            # NOTE(review): flags argument and opening `[` appear elided
                (1, 'id', u'ERROR: could not extract video id: %s' % url),
                (2, 'title', u'ERROR: could not extract video title: %s' % url),
                (3, 'date', u'ERROR: could not extract video date: %s' % url),
                (4, 'url', u'ERROR: could not extract video url: %s' % url)
            # NOTE(review): the closing `])` appears elided

        # NOTE(review): a `return {` wrapper appears elided around this info dict
            'id': info.get('id'),
            'url': compat_urllib_parse.unquote(info.get('url')),
            'uploader': u'arte.tv',
            'upload_date': info.get('date'),
            'title': info.get('title').decode('utf-8'),

    def _real_extract(self, url):
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        # Live streams and +7 catch-up pages are scraped differently
        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
            # NOTE(review): a `return` and the `else:` branch marker appear elided
            info = self.extractPlus7Stream(url)
        # NOTE(review): `return [info]` appears elided here
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor.

    First checks whether the URL is a redirect (URL shortener) by issuing
    a HEAD request and, if so, restarts the extraction chain on the target.
    Otherwise it scrapes the page for common embedded-player patterns
    (JW Player / SWFObject style ``file=...`` parameters).

    NOTE(review): this excerpt appears to have lines elided (``try:``
    headers, ``if mobj is None:`` guards, ``return`` statements, some call
    arguments); comments below mark the suspected gaps — confirm against
    the full file.
    """

    IE_NAME = u'generic'

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
        self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)

    def report_following_redirect(self, new_url):
        """Report information extraction."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case restart chain."""
        class HeadRequest(compat_urllib_request.Request):
            # Request subclass that issues HEAD so only headers are fetched
            def get_method(self):
                # NOTE(review): `return "HEAD"` appears elided here

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    # Drop body-related headers; a HEAD request carries no body
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       # NOTE(review): a `headers=newheaders,` argument appears elided
                                       origin_req_host=req.get_origin_req_host(),
                                       # NOTE(review): the closing `unverifiable=True)` and the
                                       # `else:` branch marker appear elided before the raise
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                # NOTE(review): draining/closing of `fp` appears elided here
                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                        # NOTE(review): a `headers=newheaders,` argument appears elided
                                        origin_req_host=req.get_origin_req_host(),
                                        # NOTE(review): the closing `unverifiable=True))` appears elided

        # Build a minimal opener chain that HEADs the URL and follows redirects
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        # NOTE(review): an `if url == new_url: return False` guard appears
        # elided here
        self.report_following_redirect(new_url)
        self._downloader.download([new_url])
        # NOTE(review): `return True` appears elided here

    def _real_extract(self, url):
        if self._test_redirect(url): return

        video_id = url.split('/')[-1]
        request = compat_urllib_request.Request(url)
        # NOTE(review): `try:` header appears elided before the download below
        self.report_download_webpage(video_id)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
        except ValueError as err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        # NOTE(review): `if mobj is None:` guards appear elided between each of
        # these progressively broader fallback searches
        # Broaden the search a little bit
        mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        # Broaden the search a little bit: JWPlayer JS loader
        mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        # NOTE(review): `if mobj is None:` guard appears elided before this error path
        self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        # NOTE(review): `if mobj is None:` guard appears elided before this error path
        self._downloader.trouble(u'ERROR: unable to extract title')
        video_uploader = mobj.group(1)

        # NOTE(review): a `return [{` wrapper (and 'id'/'url'/'upload_date'
        # entries) appear elided around this info dict
            'uploader': video_uploader,
            'upload_date': None,
            'title': video_title,
            'ext': video_extension,
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries.

    Handles ``ytsearch[N|all]:<query>`` pseudo-URLs by paging through the
    GData JSON API (50 results per page) and queueing each result's watch
    URL on the downloader.

    NOTE(review): this excerpt appears to have lines elided (prefix
    parsing, ``try:`` headers, loop initialisation, ``return`` statements);
    comments below mark the suspected gaps — confirm against the full file.
    """
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    # Hard cap on how many results ytsearchall will fetch
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        mobj = re.match(self._VALID_URL, query)
        # NOTE(review): `if mobj is None:` guard appears elided before this error path
        self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)

        prefix, query = query.split(':')
        # NOTE(review): prefix normalisation and the `if prefix == '':` branch
        # appear elided around here
        query = query.encode('utf-8')
        self._download_n_results(query, 1)
        # NOTE(review): a `return` appears elided here
        elif prefix == 'all':
            self._download_n_results(query, self._max_youtube_results)
        # NOTE(review): the numeric-prefix branch (`else:` / `try:` /
        # `n = int(prefix)` / `if n <= 0:`) appears elided here
            self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
        elif n > self._max_youtube_results:
            self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
            n = self._max_youtube_results
            self._download_n_results(query, n)
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""
        # NOTE(review): initialisation (`video_ids = []`, `pagenum = 0`,
        # `limit = n`) appears elided here
        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            # start-index is 1-based in the GData API
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            # NOTE(review): `try:` header appears elided before the download below
            data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err))
            api_response = json.loads(data)['data']

            if not 'items' in api_response:
                self._downloader.trouble(u'[youtube] No video results')
                # NOTE(review): a `return` appears elided here

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            # Never ask for more than the API reports as available
            limit = min(n, api_response['totalItems'])
            # NOTE(review): `pagenum += 1` appears elided here

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries.

    Handles ``gvsearch[N|all]:<query>`` pseudo-URLs by scraping result
    pages for videoplay links and queueing each one on the downloader.

    NOTE(review): this excerpt appears to have lines elided (prefix
    parsing, ``try:`` headers, loop initialisation, ``return`` statements);
    comments below mark the suspected gaps — confirm against the full file.
    """
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    # Present in the HTML while a "next page" link exists
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    # Hard cap on how many results gvsearchall will fetch
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        mobj = re.match(self._VALID_URL, query)
        # NOTE(review): `if mobj is None:` guard appears elided before this error path
        self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)

        prefix, query = query.split(':')
        # NOTE(review): prefix normalisation and the `if prefix == '':` branch
        # appear elided around here
        query = query.encode('utf-8')
        self._download_n_results(query, 1)
        # NOTE(review): a `return` appears elided here
        elif prefix == 'all':
            self._download_n_results(query, self._max_google_results)
        # NOTE(review): the numeric-prefix branch (`else:` / `try:` /
        # `n = int(prefix)` / `if n <= 0:`) appears elided here
            self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
        elif n > self._max_google_results:
            self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
            n = self._max_google_results
            self._download_n_results(query, n)
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""
        # NOTE(review): initialisation (`video_ids = []`, `pagenum = 0`) and
        # the `while True:` loop header appear elided here
        self.report_download_page(query, pagenum)
        result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
        request = compat_urllib_request.Request(result_url)
        # NOTE(review): `try:` header appears elided before the download below
        page = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))

        # Extract video identifiers
        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
            video_id = mobj.group(1)
            if video_id not in video_ids:
                video_ids.append(video_id)
                if len(video_ids) == n:
                    # Specified n videos reached
                    for id in video_ids:
                        self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
                    # NOTE(review): a `return` appears elided here

        if re.search(self._MORE_PAGES_INDICATOR, page) is None:
            # No further pages: flush everything collected so far
            for id in video_ids:
                self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
            # NOTE(review): a `return` appears elided here

        pagenum = pagenum + 1
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries.

    Handles ``yvsearch[N|all]:<query>`` pseudo-URLs by scraping result
    pages for watch links and queueing each one on the downloader. Unlike
    the Google variant, duplicates are tracked via an ``already_seen`` set.

    NOTE(review): this excerpt appears to have lines elided (prefix
    parsing, ``try:`` headers, loop initialisation, ``return`` statements);
    comments below mark the suspected gaps — confirm against the full file.
    """

    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    # Present in the HTML while a "Next" link exists
    _MORE_PAGES_INDICATOR = r'\s*Next'
    # Hard cap on how many results yvsearchall will fetch
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        mobj = re.match(self._VALID_URL, query)
        # NOTE(review): `if mobj is None:` guard appears elided before this error path
        self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)

        prefix, query = query.split(':')
        # NOTE(review): prefix normalisation and the `if prefix == '':` branch
        # appear elided around here
        query = query.encode('utf-8')
        self._download_n_results(query, 1)
        # NOTE(review): a `return` appears elided here
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
        # NOTE(review): the numeric-prefix branch (`else:` / `try:` /
        # `n = int(prefix)` / `if n <= 0:`) appears elided here
            self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
        elif n > self._max_yahoo_results:
            self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
            n = self._max_yahoo_results
            self._download_n_results(query, n)
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""
        # NOTE(review): `video_ids = []` and `pagenum` initialisation appear
        # elided around here
        already_seen = set()
        # NOTE(review): the `while True:` loop header appears elided here
        self.report_download_page(query, pagenum)
        result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
        request = compat_urllib_request.Request(result_url)
        # NOTE(review): `try:` header appears elided before the download below
        page = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))

        # Extract video identifiers
        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
            video_id = mobj.group(1)
            if video_id not in already_seen:
                video_ids.append(video_id)
                already_seen.add(video_id)
                if len(video_ids) == n:
                    # Specified n videos reached
                    for id in video_ids:
                        self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                    # NOTE(review): a `return` appears elided here

        if re.search(self._MORE_PAGES_INDICATOR, page) is None:
            # No further pages: flush everything collected so far
            for id in video_ids:
                self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
            # NOTE(review): a `return` appears elided here

        pagenum = pagenum + 1
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists.

    Matches playlist/course/artist/watch URLs (and bare PL/EC/UU ids),
    pages through the GData playlist feed, orders entries by playlist
    position, applies the playliststart/playlistend window, and queues
    each video URL on the downloader.

    NOTE(review): this excerpt appears to have lines elided; in particular
    the closing quotes of the verbose _VALID_URL raw string (and several of
    its alternation lines), guards, ``try:`` headers, loop headers and
    ``return`` statements. Comments below mark the suspected gaps — confirm
    against the full file.
    """

    _VALID_URL = r"""(?:
                     (?:course|view_play_list|my_playlists|artist|playlist|watch)
                     \? (?:.*?&)*? (?:p|a|list)=
                     ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
                     ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
    IE_NAME = u'youtube:playlist'

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        InfoExtractor.__init__(self, downloader)

    # NOTE(review): the `@classmethod` decorator appears elided here —
    # `suitable` takes `cls` and is called on the class elsewhere
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # re.VERBOSE is required because _VALID_URL is a verbose pattern
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        # NOTE(review): `if mobj is None:` guard appears elided before this error path
        self._downloader.trouble(u'ERROR: invalid url: %s' % url)

        # Download playlist videos from API
        playlist_id = mobj.group(1) or mobj.group(2)
        # NOTE(review): `videos = []` / `page_num = 1` initialisation and the
        # paging loop header appear elided here; `self._MAX_RESULTS` is a class
        # attribute not visible in this excerpt
        self.report_download_page(playlist_id, page_num)

        url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
        # NOTE(review): `try:` header appears elided before the download below
        page = compat_urllib_request.urlopen(url).read().decode('utf8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))

        # NOTE(review): `try:` header appears elided before the JSON parse
        response = json.loads(page)
        except ValueError as err:
            self._downloader.trouble(u'ERROR: Invalid JSON in API response: ' + compat_str(err))

        if not 'feed' in response or not 'entry' in response['feed']:
            self._downloader.trouble(u'ERROR: Got a malformed response from YouTube API')
            # NOTE(review): a `return` appears elided here
        # Keep (position, watch-url) pairs so entries can be sorted by
        # playlist position below
        videos += [ (entry['yt$position']['$t'], entry['content']['src'])
                    for entry in response['feed']['entry']
                    if 'content' in entry ]

        if len(response['feed']['entry']) < self._MAX_RESULTS:
            # NOTE(review): the loop `break` (and `page_num += 1`) appear elided here

        videos = map(operator.itemgetter(1), sorted(videos))
        # NOTE(review): `total = len(videos)` appears elided here

        # playliststart is 1-based on the command line; convert to 0-based slice
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)
        if playlistend == -1:
            videos = videos[playliststart:]
        # NOTE(review): an `else:` line appears elided before this slice
            videos = videos[playliststart:playlistend]

        if len(videos) == total:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total))
        # NOTE(review): an `else:` line appears elided here
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(videos)))

        for video in videos:
            self._downloader.download([video])
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels.

    Pages through a channel's /videos listing (oldest first), collects all
    watch ids until the "Next" pagination marker disappears, and queues
    each watch URL on the downloader.

    NOTE(review): this excerpt appears to have lines elided (guards,
    ``try:`` headers, loop initialisation, ``break``); comments below mark
    the suspected gaps — confirm against the full file.
    """

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    # Literal "Next »" marker present while more pages remain
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:channel'

    def report_download_page(self, channel_id, pagenum):
        """Report attempt to download channel page with given number."""
        self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))

    def _real_extract(self, url):
        # Extract channel id
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): `if mobj is None:` guard appears elided before this error path
        self._downloader.trouble(u'ERROR: invalid url: %s' % url)

        # Download channel pages
        channel_id = mobj.group(1)
        # NOTE(review): `video_ids = []` / `pagenum = 1` initialisation and the
        # `while True:` loop header appear elided here
        self.report_download_page(channel_id, pagenum)
        url = self._TEMPLATE_URL % (channel_id, pagenum)
        request = compat_urllib_request.Request(url)
        # NOTE(review): `try:` header appears elided before the download below
        page = compat_urllib_request.urlopen(request).read().decode('utf8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))

        # Extract video identifiers
        # NOTE(review): `ids_in_page = []` appears elided here
        for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))
        video_ids.extend(ids_in_page)

        if self._MORE_PAGES_INDICATOR not in page:
            # NOTE(review): a `break` appears elided here
        pagenum = pagenum + 1

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users.

    Pages through a user's uploads via the GData feed (50 ids per query),
    applies the playliststart/playlistend window, and queues each watch
    URL on the downloader.

    NOTE(review): this excerpt appears to have lines elided (guards,
    ``try:`` headers, loop initialisation, ``break``, ``else:`` branches);
    comments below mark the suspected gaps — confirm against the full file.
    """

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    # GData caps each uploads query at 50 results
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): `if mobj is None:` guard appears elided before this error path
        self._downloader.trouble(u'ERROR: invalid url: %s' % url)

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.
        # NOTE(review): `video_ids = []` / `pagenum = 0` initialisation and the
        # `while True:` loop header appear elided here
        start_index = pagenum * self._GDATA_PAGE_SIZE + 1
        self.report_download_page(username, start_index)

        request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

        # NOTE(review): `try:` header appears elided before the download below
        page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))

        # Extract video identifiers
        # NOTE(review): `ids_in_page = []` appears elided here
        for mobj in re.finditer(self._VIDEO_INDICATOR, page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(mobj.group(1))

        video_ids.extend(ids_in_page)

        # A little optimization - if current page is not
        # "full", ie. does not contain PAGE_SIZE video ids then
        # we can assume that this page is the last one - there
        # are no more ids on further pages - no need to query
        # again.
        if len(ids_in_page) < self._GDATA_PAGE_SIZE:
            # NOTE(review): a `break` (and the loop's `pagenum += 1`) appear elided here

        all_ids_count = len(video_ids)
        # playliststart is 1-based on the command line; convert to 0-based slice
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        # NOTE(review): an `else:` line appears elided before this slice
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
                (username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users.

    Resolves the numeric users_id from the profile page, then pages
    through the mobile episode-list AJAX endpoint collecting episode
    paths, applies the playliststart/playlistend window, and queues each
    episode URL on the downloader.

    NOTE(review): this excerpt appears to have lines elided (guards,
    ``try:`` headers, loop initialisation, ``break``, ``else:`` branches,
    and the ``_PAGE_SIZE`` class attribute referenced below); comments
    below mark the suspected gaps — confirm against the full file.
    """

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    IE_NAME = u'blip.tv:user'

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
                (self.IE_NAME, username, pagenum))

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): `if mobj is None:` guard appears elided before this error path
        self._downloader.trouble(u'ERROR: invalid url: %s' % url)

        username = mobj.group(1)

        # The AJAX endpoint still needs the numeric users_id filled in below
        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        # NOTE(review): `try:` header appears elided before the download below
        page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        # Resolve the numeric users_id from the profile page markup
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        page_base = page_base % mobj.group(1)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.
        # NOTE(review): `video_ids = []` / `pagenum = 1` initialisation and the
        # `while True:` loop header appear elided here
        self.report_download_page(username, pagenum)
        url = page_base + "&page=" + str(pagenum)
        request = compat_urllib_request.Request( url )
        # NOTE(review): `try:` header appears elided before the download below
        page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))

        # Extract video identifiers
        # NOTE(review): `ids_in_page = []` appears elided here
        for mobj in re.finditer(r'href="/([^"]+)"', page):
            if mobj.group(1) not in ids_in_page:
                ids_in_page.append(unescapeHTML(mobj.group(1)))

        video_ids.extend(ids_in_page)

        # A little optimization - if current page is not
        # "full", ie. does not contain PAGE_SIZE video ids then
        # we can assume that this page is the last one - there
        # are no more ids on further pages - no need to query
        # again.
        if len(ids_in_page) < self._PAGE_SIZE:
            # NOTE(review): a `break` (and the loop's `pagenum += 1`) appear elided here

        all_ids_count = len(video_ids)
        # playliststart is 1-based on the command line; convert to 0-based slice
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        # NOTE(review): an `else:` line appears elided before this slice
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
                (self.IE_NAME, username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download([u'http://blip.tv/'+video_id])
1991 class DepositFilesIE(InfoExtractor):
1992 """Information extractor for depositfiles.com"""
# Group 1 captures the file id; the inline (?#locale) is a regex comment, so
# '../' optionally skips a two-character locale path segment.
1994 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
1996 def report_download_webpage(self, file_id):
1997 """Report webpage download."""
1998 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
2000 def report_extraction(self, file_id):
2001 """Report information extraction."""
2002 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
2004 def _real_extract(self, url):
2005 file_id = url.split('/')[-1]
2006 # Rebuild url in english locale
2007 url = 'http://depositfiles.com/en/files/' + file_id
2009 # Retrieve file webpage with 'Free download' button pressed
# POSTing gateway_result=1 simulates clicking the 'Free download' button.
2010 free_download_indication = { 'gateway_result' : '1' }
2011 request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
2013 self.report_download_webpage(file_id)
2014 webpage = compat_urllib_request.urlopen(request).read()
2015 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2016 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % compat_str(err))
2019 # Search for the real file URL
2020 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
2021 if (mobj is None) or (mobj.group(1) is None):
2022 # Try to figure out reason of the error.
# The site reports restrictions (e.g. download limits) in a <strong> block
# starting with 'Attention'; surface that text to the user when present.
2023 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
2024 if (mobj is not None) and (mobj.group(1) is not None):
2025 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2026 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2028 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2031 file_url = mobj.group(1)
2032 file_extension = os.path.splitext(file_url)[1][1:]
2034 # Search for file title
2035 mobj = re.search(r'<b title="(.*?)">', webpage)
2037 self._downloader.trouble(u'ERROR: unable to extract title')
# NOTE(review): .decode('utf-8') on these values assumes py2 byte strings;
# under Python 3 str has no decode method — confirm against the target runtime.
2039 file_title = mobj.group(1).decode('utf-8')
2042 'id': file_id.decode('utf-8'),
2043 'url': file_url.decode('utf-8'),
2045 'upload_date': None,
2046 'title': file_title,
2047 'ext': file_extension.decode('utf-8'),
2051 class FacebookIE(InfoExtractor):
2052 """Information Extractor for Facebook"""
# The named group ID is the numeric video id from video.php/photo.php URLs.
2054 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2055 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2056 _NETRC_MACHINE = 'facebook'
2057 IE_NAME = u'facebook'
2059 def report_login(self):
2060 """Report attempt to log in."""
2061 self._downloader.to_screen(u'[%s] Logging in' % self.IE_NAME)
# Optional login: credentials come from --username/--password or ~/.netrc.
# Failures are warnings, not fatal - extraction proceeds unauthenticated.
2063 def _real_initialize(self):
2064 if self._downloader is None:
2069 downloader_params = self._downloader.params
2071 # Attempt to use provided username and password or .netrc data
2072 if downloader_params.get('username', None) is not None:
2073 useremail = downloader_params['username']
2074 password = downloader_params['password']
2075 elif downloader_params.get('usenetrc', False):
2077 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2078 if info is not None:
2082 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2083 except (IOError, netrc.NetrcParseError) as err:
2084 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
2087 if useremail is None:
2096 request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
2099 login_results = compat_urllib_request.urlopen(request).read()
# A login form still present in the response means authentication failed.
2100 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2101 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2103 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2104 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
2107 def _real_extract(self, url):
2108 mobj = re.match(self._VALID_URL, url)
2110 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2112 video_id = mobj.group('ID')
2114 url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
2115 webpage = self._download_webpage(url, video_id)
# The flashvars JSON is sandwiched between these two literal script fragments.
2117 BEFORE = '[["allowFullScreen","true"],["allowScriptAccess","always"],["salign","tl"],["scale","noscale"],["wmode","opaque"]].forEach(function(param) {swf.addParam(param[0], param[1]);});\n'
2118 AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
2119 m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
2121 raise ExtractorError(u'Cannot parse data')
2122 data = dict(json.loads(m.group(1)))
# 'params' is URL-quoted JSON nested inside the outer JSON blob.
2123 params_raw = compat_urllib_parse.unquote(data['params'])
2124 params = json.loads(params_raw)
# Prefer the HD stream, fall back to SD, else give up.
2125 video_url = params['hd_src']
2127 video_url = params['sd_src']
2129 raise ExtractorError(u'Cannot find video URL')
2130 video_duration = int(params['video_duration'])
2132 m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
2134 raise ExtractorError(u'Cannot find title in webpage')
2135 video_title = unescapeHTML(m.group(1))
2139 'title': video_title,
2142 'duration': video_duration,
2143 'thumbnail': params['thumbnail_src'],
2148 class BlipTVIE(InfoExtractor):
2149 """Information extractor for blip.tv"""
2151 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
# Used to pull the filename extension off the media URL.
2152 _URL_EXT = r'^.*\.([a-z0-9]+)$'
2153 IE_NAME = u'blip.tv'
2155 def report_extraction(self, file_id):
2156 """Report information extraction."""
2157 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2159 def report_direct_download(self, title):
2160 """Report information extraction."""
2161 self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
2163 def _real_extract(self, url):
2164 mobj = re.match(self._VALID_URL, url)
2166 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Request JSON metadata instead of the HTML page.
2173 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2174 request = compat_urllib_request.Request(json_url)
# The same UA string is echoed into the info dict below so the downloader
# replays it when fetching the media.
2175 request.add_header('User-Agent', 'iTunes/10.6.1')
2176 self.report_extraction(mobj.group(1))
2179 urlh = compat_urllib_request.urlopen(request)
# Some responses are the media itself; handle that without JSON parsing.
2180 if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
2181 basename = url.split('/')[-1]
2182 title,ext = os.path.splitext(basename)
2183 title = title.decode('UTF-8')
2184 ext = ext.replace('.', '')
2185 self.report_direct_download(title)
2190 'upload_date': None,
2195 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2196 raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
2197 if info is None: # Regular URL
2199 json_code_bytes = urlh.read()
2200 json_code = json_code_bytes.decode('utf-8')
2201 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2202 self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % compat_str(err))
2206 json_data = json.loads(json_code)
2207 if 'Post' in json_data:
2208 data = json_data['Post']
# Normalize blip.tv's '%m-%d-%y %H:%M%p' datestamp to YYYYMMDD.
2212 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2213 video_url = data['media']['url']
2214 umobj = re.match(self._URL_EXT, video_url)
2216 raise ValueError('Can not determine filename extension')
2217 ext = umobj.group(1)
2220 'id': data['item_id'],
2222 'uploader': data['display_name'],
2223 'upload_date': upload_date,
2224 'title': data['title'],
2226 'format': data['media']['mimeType'],
2227 'thumbnail': data['thumbnailUrl'],
2228 'description': data['description'],
2229 'player_url': data['embedUrl'],
2230 'user_agent': 'iTunes/10.6.1',
# Missing/malformed JSON fields are reported, not raised further.
2232 except (ValueError,KeyError) as err:
2233 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
2239 class MyVideoIE(InfoExtractor):
2240 """Information Extractor for myvideo.de."""
2242 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
2243 IE_NAME = u'myvideo'
2245 def __init__(self, downloader=None):
2246 InfoExtractor.__init__(self, downloader)
2248 def report_extraction(self, video_id):
2249 """Report information extraction."""
2250 self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
2252 def _real_extract(self,url):
2253 mobj = re.match(self._VALID_URL, url)
2255 self._download.trouble(u'ERROR: invalid URL: %s' % url)
2258 video_id = mobj.group(1)
2261 webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
2262 webpage = self._download_webpage(webpage_url, video_id)
2264 self.report_extraction(video_id)
2265 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\' />',
2268 self._downloader.trouble(u'ERROR: unable to extract media URL')
2270 video_url = mobj.group(1) + ('/%s.flv' % video_id)
2272 mobj = re.search('<title>([^<]+)</title>', webpage)
2274 self._downloader.trouble(u'ERROR: unable to extract title')
2277 video_title = mobj.group(1)
2283 'upload_date': None,
2284 'title': video_title,
2288 class ComedyCentralIE(InfoExtractor):
2289 """Information extractor for The Daily Show and Colbert Report """
2291 # urls can be abbreviations like :thedailyshow or :colbert
2292 # urls for episodes like:
2293 # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
2294 # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
2295 # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
2296 _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
2297 |(https?://)?(www\.)?
2298 (?P<showname>thedailyshow|colbertnation)\.com/
2299 (full-episodes/(?P<episode>.*)|
2301 (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
2302 |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
# Bitrates in ascending order; the last entry is the best quality.
2305 _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
2307 _video_extensions = {
2315 _video_dimensions = {
# Overridden because _VALID_URL is a verbose (re.VERBOSE) pattern.
2325 def suitable(cls, url):
2326 """Receives a URL and returns True if suitable for this IE."""
2327 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
2329 def report_extraction(self, episode_id):
2330 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
2332 def report_config_download(self, episode_id, media_id):
2333 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration for %s' % (episode_id, media_id))
2335 def report_index_download(self, episode_id):
2336 self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
2338 def _print_formats(self, formats):
2339 print('Available formats:')
2341 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))
2344 def _real_extract(self, url):
2345 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2347 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Shorthand forms (:tds, :colbert, ...) expand to the newest full episode.
2350 if mobj.group('shortname'):
2351 if mobj.group('shortname') in ('tds', 'thedailyshow'):
2352 url = u'http://www.thedailyshow.com/full-episodes/'
2354 url = u'http://www.colbertnation.com/full-episodes/'
2355 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2356 assert mobj is not None
2358 if mobj.group('clip'):
2359 if mobj.group('showname') == 'thedailyshow':
2360 epTitle = mobj.group('tdstitle')
2362 epTitle = mobj.group('cntitle')
2365 dlNewest = not mobj.group('episode')
2367 epTitle = mobj.group('showname')
2369 epTitle = mobj.group('episode')
2371 req = compat_urllib_request.Request(url)
2372 self.report_extraction(epTitle)
2374 htmlHandle = compat_urllib_request.urlopen(req)
2375 html = htmlHandle.read()
2376 webpage = html.decode('utf-8')
2377 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2378 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
# Follow the redirect to the concrete episode page and re-parse the URL.
2381 url = htmlHandle.geturl()
2382 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2384 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
2386 if mobj.group('episode') == '':
2387 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
2389 epTitle = mobj.group('episode')
# The mtvnservices player URI carries the mgid needed for the feeds below.
2391 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)
2393 if len(mMovieParams) == 0:
2394 # The Colbert Report embeds the information in a without
2395 # a URL prefix; so extract the alternate reference
2396 # and then add the URL prefix manually.
2398 altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
2399 if len(altMovieParams) == 0:
2400 self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
2403 mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
2405 uri = mMovieParams[0][1]
2406 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
2407 self.report_index_download(epTitle)
2409 indexXml = compat_urllib_request.urlopen(indexUrl).read()
2410 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2411 self._downloader.trouble(u'ERROR: unable to download episode index: ' + compat_str(err))
# One MRSS <item> per act/part of the episode.
2416 idoc = xml.etree.ElementTree.fromstring(indexXml)
2417 itemEls = idoc.findall('.//item')
2418 for partNum,itemEl in enumerate(itemEls):
2419 mediaId = itemEl.findall('./guid')[0].text
2420 shortMediaId = mediaId.split(':')[-1]
2421 showId = mediaId.split(':')[-2].replace('.com', '')
2422 officialTitle = itemEl.findall('./title')[0].text
2423 officialDate = itemEl.findall('./pubDate')[0].text
2425 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2426 compat_urllib_parse.urlencode({'uri': mediaId}))
2427 configReq = compat_urllib_request.Request(configUrl)
2428 self.report_config_download(epTitle, shortMediaId)
2430 configXml = compat_urllib_request.urlopen(configReq).read()
2431 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2432 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
# Collect (bitrate, rtmp-url) pairs for every available rendition.
2435 cdoc = xml.etree.ElementTree.fromstring(configXml)
2437 for rendition in cdoc.findall('.//rendition'):
2438 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2442 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
2445 if self._downloader.params.get('listformats', None):
2446 self._print_formats([i[0] for i in turls])
2449 # For now, just pick the highest bitrate
2450 format,rtmp_video_url = turls[-1]
2452 # Get the format arg from the arg stream
2453 req_format = self._downloader.params.get('format', None)
2455 # Select format if we can find one
2458 format, rtmp_video_url = f, v
# Rewrite the rtmp URL to the equivalent plain-HTTP mp4 mirror.
2461 m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
2463 raise ExtractorError(u'Cannot transform RTMP url')
2464 base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
2465 video_url = base + m.group('finalid')
2467 effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
2472 'upload_date': officialDate,
2477 'description': officialTitle,
2479 results.append(info)
2484 class EscapistIE(InfoExtractor):
2485 """Information extractor for The Escapist """
2487 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2488 IE_NAME = u'escapist'
2490 def report_extraction(self, showName):
2491 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
2493 def report_config_download(self, showName):
2494 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
2496 def _real_extract(self, url):
2497 mobj = re.match(self._VALID_URL, url)
2499 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2501 showName = mobj.group('showname')
2502 videoId = mobj.group('episode')
2504 self.report_extraction(showName)
2506 webPage = compat_urllib_request.urlopen(url)
2507 webPageBytes = webPage.read()
# Honor the charset from the Content-Type header; default to UTF-8.
2508 m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
2509 webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
2510 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2511 self._downloader.trouble(u'ERROR: unable to download webpage: ' + compat_str(err))
# Pull description/thumbnail/player from the page's meta tags.
2514 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2515 description = unescapeHTML(descMatch.group(1))
2516 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2517 imgUrl = unescapeHTML(imgMatch.group(1))
2518 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2519 playerUrl = unescapeHTML(playerUrlMatch.group(1))
# The player URL embeds a quoted config URL in its query string.
2520 configUrlMatch = re.search('config=(.*)$', playerUrl)
2521 configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))
2523 self.report_config_download(showName)
2525 configJSON = compat_urllib_request.urlopen(configUrl)
2526 m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
2527 configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
2528 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2529 self._downloader.trouble(u'ERROR: unable to download configuration: ' + compat_str(err))
2532 # Technically, it's JavaScript, not JSON
# Single->double quote swap makes the JS object literal json-parseable.
2533 configJSON = configJSON.replace("'", '"')
2536 config = json.loads(configJSON)
2537 except (ValueError,) as err:
2538 self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + compat_str(err))
2541 playlist = config['playlist']
# Index 1 of the playlist holds the actual episode media URL.
2542 videoUrl = playlist[1]['url']
2547 'uploader': showName,
2548 'upload_date': None,
2551 'thumbnail': imgUrl,
2552 'description': description,
2553 'player_url': playerUrl,
2558 class CollegeHumorIE(InfoExtractor):
2559 """Information extractor for collegehumor.com"""
2562 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2563 IE_NAME = u'collegehumor'
2565 def report_manifest(self, video_id):
2566 """Report information extraction."""
2567 self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))
2569 def report_extraction(self, video_id):
2570 """Report information extraction."""
2571 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2573 def _real_extract(self, url):
2574 mobj = re.match(self._VALID_URL, url)
2576 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2578 video_id = mobj.group('videoid')
2583 'upload_date': None,
# Step 1: metadata XML gives title/description/thumbnail and the f4m manifest URL.
2586 self.report_extraction(video_id)
2587 xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
2589 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2590 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2591 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
2594 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2596 videoNode = mdoc.findall('./video')[0]
2597 info['description'] = videoNode.findall('./description')[0].text
2598 info['title'] = videoNode.findall('./caption')[0].text
2599 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
2600 manifest_url = videoNode.findall('./file')[0].text
2602 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
# Step 2: the Adobe HDS (f4m) manifest names the media segment to fetch.
2605 manifest_url += '?hdcore=2.10.3'
2606 self.report_manifest(video_id)
2608 manifestXml = compat_urllib_request.urlopen(manifest_url).read()
2609 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2610 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
2613 adoc = xml.etree.ElementTree.fromstring(manifestXml)
2615 media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
2616 node_id = media_node.attrib['url']
2617 video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
2618 except IndexError as err:
2619 self._downloader.trouble(u'\nERROR: Invalid manifest file')
# Compose the direct segment URL from the manifest host and media node.
2622 url_pr = compat_urllib_parse_urlparse(manifest_url)
2623 url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
2630 class XVideosIE(InfoExtractor):
2631 """Information extractor for xvideos.com"""
2633 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2634 IE_NAME = u'xvideos'
2636 def report_extraction(self, video_id):
2637 """Report information extraction."""
2638 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2640 def _real_extract(self, url):
2641 mobj = re.match(self._VALID_URL, url)
2643 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2645 video_id = mobj.group(1)
2647 webpage = self._download_webpage(url, video_id)
2649 self.report_extraction(video_id)
# The flv URL is URL-quoted inside the page's flashvars.
2653 mobj = re.search(r'flv_url=(.+?)&', webpage)
2655 self._downloader.trouble(u'ERROR: unable to extract video url')
2657 video_url = compat_urllib_parse.unquote(mobj.group(1))
# Title comes from the <title> tag, minus the trailing '- XVID...' suffix.
2661 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2663 self._downloader.trouble(u'ERROR: unable to extract video title')
2665 video_title = mobj.group(1)
2668 # Extract video thumbnail
# group(0): the whole matched URL is the thumbnail address.
2669 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
2671 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2673 video_thumbnail = mobj.group(0)
2679 'upload_date': None,
2680 'title': video_title,
2682 'thumbnail': video_thumbnail,
2683 'description': None,
2689 class SoundcloudIE(InfoExtractor):
2690 """Information extractor for soundcloud.com
2691 To access the media, the uid of the song and a stream token
2692 must be extracted from the page source and the script must make
2693 a request to media.soundcloud.com/crossdomain.xml. Then
2694 the media can be grabbed by requesting from an url composed
2695 of the stream token and uid
# group(1) = uploader slug, group(2) = track slug.
2698 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2699 IE_NAME = u'soundcloud'
2701 def __init__(self, downloader=None):
2702 InfoExtractor.__init__(self, downloader)
2704 def report_resolve(self, video_id):
2705 """Report information extraction."""
2706 self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))
2708 def report_extraction(self, video_id):
2709 """Report information extraction."""
2710 self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))
2712 def _real_extract(self, url):
2713 mobj = re.match(self._VALID_URL, url)
2715 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2718 # extract uploader (which is in the url)
2719 uploader = mobj.group(1)
2720 # extract simple title (uploader + slug of song title)
2721 slug_title = mobj.group(2)
2722 simple_title = uploader + u'-' + slug_title
2724 self.report_resolve('%s/%s' % (uploader, slug_title))
# Step 1: resolve the page URL to track metadata via the public API.
# The client_id is the app key baked into this extractor.
2726 url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
2727 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2728 request = compat_urllib_request.Request(resolv_url)
2730 info_json_bytes = compat_urllib_request.urlopen(request).read()
2731 info_json = info_json_bytes.decode('utf-8')
2732 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2733 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2736 info = json.loads(info_json)
2737 video_id = info['id']
2738 self.report_extraction('%s/%s' % (uploader, slug_title))
# Step 2: fetch the per-track stream definitions and pick the mp3 stream.
2740 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2741 request = compat_urllib_request.Request(streams_url)
2743 stream_json_bytes = compat_urllib_request.urlopen(request).read()
2744 stream_json = stream_json_bytes.decode('utf-8')
2745 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2746 self._downloader.trouble(u'ERROR: unable to download stream definitions: %s' % compat_str(err))
2749 streams = json.loads(stream_json)
2750 mediaURL = streams['http_mp3_128_url']
2755 'uploader': info['user']['username'],
2756 'upload_date': info['created_at'],
2757 'title': info['title'],
2759 'description': info['description'],
2763 class InfoQIE(InfoExtractor):
2764 """Information extractor for infoq.com"""
2765 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2767 def report_extraction(self, video_id):
2768 """Report information extraction."""
2769 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2771 def _real_extract(self, url):
2772 mobj = re.match(self._VALID_URL, url)
2774 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# No separate id in the URL pattern, so the full URL doubles as the id here.
2777 webpage = self._download_webpage(url, video_id=url)
2778 self.report_extraction(url)
# The real media id is base64-encoded in the page's jsclassref attribute.
2781 mobj = re.search(r"jsclassref='([^']*)'", webpage)
2783 self._downloader.trouble(u'ERROR: unable to extract video url')
2785 real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
2786 video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id
2789 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2791 self._downloader.trouble(u'ERROR: unable to extract video title')
2793 video_title = mobj.group(1)
2795 # Extract description
2796 video_description = u'No description available.'
2797 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2798 if mobj is not None:
2799 video_description = mobj.group(1)
# Derive the final id and extension from the decoded media filename.
2801 video_filename = video_url.split('/')[-1]
2802 video_id, extension = video_filename.split('.')
2808 'upload_date': None,
2809 'title': video_title,
2810 'ext': extension, # Extension is always(?) mp4, but seems to be flv
2812 'description': video_description,
2817 class MixcloudIE(InfoExtractor):
2818 """Information extractor for www.mixcloud.com"""
2820 _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
2821 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2822 IE_NAME = u'mixcloud'
2824 def __init__(self, downloader=None):
2825 InfoExtractor.__init__(self, downloader)
2827 def report_download_json(self, file_id):
2828 """Report JSON download."""
2829 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
2831 def report_extraction(self, file_id):
2832 """Report information extraction."""
2833 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2835 def get_urls(self, jsonData, fmt, bitrate='best'):
2836 """Get urls from 'audio_formats' section in json"""
2839 bitrate_list = jsonData[fmt]
2840 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2841 bitrate = max(bitrate_list) # select highest
2843 url_list = jsonData[fmt][bitrate]
# Some formats map directly to a url list instead of a bitrate dict.
2844 except TypeError: # we have no bitrate info.
2845 url_list = jsonData[fmt]
2848 def check_urls(self, url_list):
2849 """Returns 1st active url from list"""
2850 for url in url_list:
2852 compat_urllib_request.urlopen(url)
# Dead mirror: try the next candidate URL.
2854 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2859 def _print_formats(self, formats):
2860 print('Available formats:')
2861 for fmt in formats.keys():
2862 for b in formats[fmt]:
2864 ext = formats[fmt][b][0]
2865 print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
2866 except TypeError: # we have no bitrate info
2867 ext = formats[fmt][0]
2868 print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
2871 def _real_extract(self, url):
2872 mobj = re.match(self._VALID_URL, url)
2874 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2876 # extract uploader & filename from url
# NOTE(review): .decode('utf-8') on regex-group str objects fails under
# Python 3 (str has no decode) - consistent with _WORKING = False above;
# confirm intended runtime before relying on this extractor.
2877 uploader = mobj.group(1).decode('utf-8')
2878 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2880 # construct API request
2881 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2882 # retrieve .json file with links to files
2883 request = compat_urllib_request.Request(file_url)
2885 self.report_download_json(file_url)
2886 jsonData = compat_urllib_request.urlopen(request).read()
2887 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2888 self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err))
2892 json_data = json.loads(jsonData)
2893 player_url = json_data['player_swf_url']
2894 formats = dict(json_data['audio_formats'])
2896 req_format = self._downloader.params.get('format', None)
2899 if self._downloader.params.get('listformats', None):
2900 self._print_formats(formats)
# No explicit format requested: take the first format with a live URL.
2903 if req_format is None or req_format == 'best':
2904 for format_param in formats.keys():
2905 url_list = self.get_urls(formats, format_param)
2907 file_url = self.check_urls(url_list)
2908 if file_url is not None:
2911 if req_format not in formats:
2912 self._downloader.trouble(u'ERROR: format is not available')
2915 url_list = self.get_urls(formats, req_format)
2916 file_url = self.check_urls(url_list)
2917 format_param = req_format
2920 'id': file_id.decode('utf-8'),
2921 'url': file_url.decode('utf-8'),
2922 'uploader': uploader.decode('utf-8'),
2923 'upload_date': None,
2924 'title': json_data['name'],
2925 'ext': file_url.split('.')[-1].decode('utf-8'),
2926 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2927 'thumbnail': json_data['thumbnail_url'],
2928 'description': json_data['description'],
2929 'player_url': player_url.decode('utf-8'),
2932 class StanfordOpenClassroomIE(InfoExtractor):
2933 """Information extractor for Stanford's Open ClassRoom"""
# Named groups: 'course' and 'video' select a single lecture, 'course' alone
# a course page; with neither, the URL is the site root.
2935 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
2936 IE_NAME = u'stanfordoc'
2938 def report_download_webpage(self, objid):
2939 """Report information extraction."""
2940 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
2942 def report_extraction(self, video_id):
2943 """Report information extraction."""
2944 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2946 def _real_extract(self, url):
2947 mobj = re.match(self._VALID_URL, url)
2949 raise ExtractorError(u'Invalid URL: %s' % url)
2951 if mobj.group('course') and mobj.group('video'): # A specific video
2952 course = mobj.group('course')
2953 video = mobj.group('video')
2955 'id': course + '_' + video,
2957 'upload_date': None,
2960 self.report_extraction(info['id'])
2961 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
2962 xmlUrl = baseUrl + video + '.xml'
2964 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2965 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2966 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
2968 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2970 info['title'] = mdoc.findall('./title')[0].text
2971 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
2973 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2975 info['ext'] = info['url'].rpartition('.')[2]
2977 elif mobj.group('course'): # A course page
2978 course = mobj.group('course')
2983 'upload_date': None,
2986 coursepage = self._download_webpage(url, info['id'],
2987 note='Downloading course info page',
2988 errnote='Unable to download course info page')
2990 m = re.search('<h1>([^<]+)</h1>', coursepage)
2992 info['title'] = unescapeHTML(m.group(1))
2994 info['title'] = info['id']
2996 m = re.search('<description>([^<]+)</description>', coursepage)
2998 info['description'] = unescapeHTML(m.group(1))
3000 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
3003 'type': 'reference',
3004 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
3008 for entry in info['list']:
3009 assert entry['type'] == 'reference'
3010 results += self.extract(entry['url'])
3014 'id': 'Stanford OpenClassroom',
3017 'upload_date': None,
3020 self.report_download_webpage(info['id'])
3021 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
3023 rootpage = compat_urllib_request.urlopen(rootURL).read()
3024 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3025 self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
3028 info['title'] = info['id']
3030 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
3033 'type': 'reference',
3034 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
3039 for entry in info['list']:
3040 assert entry['type'] == 'reference'
3041 results += self.extract(entry['url'])
# Extractor for mtv.com video pages: scrapes <meta> tags for song/performer/uri,
# then fetches a mediaGen XML playlist and picks the last (highest-quality) rendition.
# NOTE(review): numbered listing with missing lines (the `if mobj is None:` /
# `return` lines are absent); restore the full source before editing code.
3044 class MTVIE(InfoExtractor):
3045 """Information extractor for MTV.com"""
3047 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
3050 def report_extraction(self, video_id):
3051 """Report information extraction."""
3052 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3054 def _real_extract(self, url):
3055 mobj = re.match(self._VALID_URL, url)
3057 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Scheme is optional in _VALID_URL, so normalize to http:// before downloading.
3059 if not mobj.group('proto'):
3060 url = 'http://' + url
3061 video_id = mobj.group('videoid')
3063 webpage = self._download_webpage(url, video_id)
3065 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
3067 self._downloader.trouble(u'ERROR: unable to extract song name')
# NOTE(review): .decode('iso-8859-1') on a match from a str webpage is
# Python-2-only behavior -- would raise AttributeError on Python 3; confirm target.
3069 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3070 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
3072 self._downloader.trouble(u'ERROR: unable to extract performer')
3074 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3075 video_title = performer + ' - ' + song_name
3077 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
3079 self._downloader.trouble(u'ERROR: unable to mtvn_uri')
3081 mtvn_uri = mobj.group(1)
3083 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
3085 self._downloader.trouble(u'ERROR: unable to extract content id')
3087 content_id = mobj.group(1)
# mediaGen endpoint returns an XML document listing available renditions.
3089 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
3090 self.report_extraction(video_id)
3091 request = compat_urllib_request.Request(videogen_url)
3093 metadataXml = compat_urllib_request.urlopen(request).read()
3094 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3095 self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err))
3098 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
3099 renditions = mdoc.findall('.//rendition')
3101 # For now, always pick the highest quality.
3102 rendition = renditions[-1]
# Format string is built as "<ext>-<width>x<height>_<bitrate>" from rendition attrs.
3105 _,_,ext = rendition.attrib['type'].partition('/')
3106 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
3107 video_url = rendition.find('./src').text
3109 self._downloader.trouble('Invalid rendition field.')
3115 'uploader': performer,
3116 'upload_date': None,
3117 'title': video_title,
# Extractor for v.youku.com. Downloads the getPlayList JSON, derives the real
# file id from a seeded pseudo-random character mix, then yields one info dict
# per video segment (multi-part downloads).
# NOTE(review): numbered listing with gaps (e.g. the `def _gen_sid` header and
# several `return`s are missing); recover the full source before editing code.
3125 class YoukuIE(InfoExtractor):
3126 _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
3128 def report_download_webpage(self, file_id):
3129 """Report webpage download."""
3130 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, file_id))
3132 def report_extraction(self, file_id):
3133 """Report information extraction."""
3134 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
# Session id: millisecond timestamp plus two bounded random integers.
3137 nowTime = int(time.time() * 1000)
3138 random1 = random.randint(1000,1998)
3139 random2 = random.randint(1000,9999)
3141 return "%d%d%d" %(nowTime,random1,random2)
# Deterministically shuffles an alphabet using Youku's seeded LCG
# (seed = (seed*211 + 30031) % 65536); the mixed string maps digit indices
# in the obfuscated fileId back to real characters.
3143 def _get_file_ID_mix_string(self, seed):
3145 source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
3147 for i in range(len(source)):
3148 seed = (seed * 211 + 30031 ) % 65536
3149 index = math.floor(seed / 65536 * len(source) )
3150 mixed.append(source[int(index)])
3151 source.remove(source[int(index)])
3152 #return ''.join(mixed)
# Decodes "12*34*..." style fileId: each '*'-separated number indexes the
# mixed alphabet produced above.
3155 def _get_file_id(self, fileId, seed):
3156 mixed = self._get_file_ID_mix_string(seed)
3157 ids = fileId.split('*')
3161 realId.append(mixed[int(ch)])
3162 return ''.join(realId)
3164 def _real_extract(self, url):
3165 mobj = re.match(self._VALID_URL, url)
3167 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3169 video_id = mobj.group('ID')
3171 info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
3173 request = compat_urllib_request.Request(info_url, None, std_headers)
3175 self.report_download_webpage(video_id)
3176 jsondata = compat_urllib_request.urlopen(request).read()
3177 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3178 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
3181 self.report_extraction(video_id)
3183 jsonstr = jsondata.decode('utf-8')
3184 config = json.loads(jsonstr)
3186 video_title = config['data'][0]['title']
3187 seed = config['data'][0]['seed']
# Map the user-requested format onto Youku's stream names ('hd2' etc.).
3189 format = self._downloader.params.get('format', None)
3190 supported_format = list(config['data'][0]['streamfileids'].keys())
3192 if format is None or format == 'best':
3193 if 'hd2' in supported_format:
3198 elif format == 'worst':
3206 fileid = config['data'][0]['streamfileids'][format]
3207 keys = [s['k'] for s in config['data'][0]['segs'][format]]
3208 except (UnicodeDecodeError, ValueError, KeyError):
3209 self._downloader.trouble(u'ERROR: unable to extract info section')
3213 sid = self._gen_sid()
3214 fileid = self._get_file_id(fileid, seed)
3216 #column 8,9 of fileid represent the segment number
3217 #fileid[7:9] should be changed
# One download URL (and info dict) per segment; key 'k' authenticates each part.
3218 for index, key in enumerate(keys):
3220 temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
3221 download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
3224 'id': '%s_part%02d' % (video_id, index),
3225 'url': download_url,
3227 'upload_date': None,
3228 'title': video_title,
3231 files_info.append(info)
# Extractor for video.xnxx.com: pulls flv_url, title and thumbnail out of the
# watch page with three precompiled-pattern constants.
# NOTE(review): numbered listing with gaps (the `if result is None:` guards and
# the info-dict opener are missing); restore full source before editing code.
3236 class XNXXIE(InfoExtractor):
3237 """Information extractor for xnxx.com"""
3239 _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
# Regex constants: flash video URL, page title (minus the " - XNXX.COM" suffix),
# and the large thumbnail URL embedded in the player parameters.
3241 VIDEO_URL_RE = r'flv_url=(.*?)&'
3242 VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
3243 VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'
3245 def report_webpage(self, video_id):
3246 """Report information extraction"""
3247 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3249 def report_extraction(self, video_id):
3250 """Report information extraction"""
3251 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3253 def _real_extract(self, url):
3254 mobj = re.match(self._VALID_URL, url)
3256 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3258 video_id = mobj.group(1)
3260 self.report_webpage(video_id)
3262 # Get webpage content
3264 webpage_bytes = compat_urllib_request.urlopen(url).read()
3265 webpage = webpage_bytes.decode('utf-8')
3266 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3267 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
3270 result = re.search(self.VIDEO_URL_RE, webpage)
3272 self._downloader.trouble(u'ERROR: unable to extract video url')
# The flv URL is percent-encoded inside the player parameters.
3274 video_url = compat_urllib_parse.unquote(result.group(1))
3276 result = re.search(self.VIDEO_TITLE_RE, webpage)
3278 self._downloader.trouble(u'ERROR: unable to extract video title')
3280 video_title = result.group(1)
3282 result = re.search(self.VIDEO_THUMB_RE, webpage)
3284 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3286 video_thumbnail = result.group(1)
3292 'upload_date': None,
3293 'title': video_title,
3295 'thumbnail': video_thumbnail,
3296 'description': None,
# Extractor for Google+ video posts: scrapes the post page for date/uploader/
# title, follows the photo page it references, then collects all resolution
# variants and keeps the highest one.
# NOTE(review): numbered listing with gaps (guard `if mobj:` lines, `try:`
# headers and the final info-dict opener are missing); recover from VCS first.
3300 class GooglePlusIE(InfoExtractor):
3301 """Information extractor for plus.google.com."""
3303 _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
3304 IE_NAME = u'plus.google'
3306 def __init__(self, downloader=None):
3307 InfoExtractor.__init__(self, downloader)
# Status reporting helpers (one per extraction stage).
3309 def report_extract_entry(self, url):
3310 """Report downloading extry"""
3311 self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)
3313 def report_date(self, upload_date):
3314 """Report downloading extry"""
3315 self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)
3317 def report_uploader(self, uploader):
3318 """Report downloading extry"""
3319 self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)
3321 def report_title(self, video_title):
3322 """Report downloading extry"""
3323 self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)
3325 def report_extract_vid_page(self, video_page):
3326 """Report information extraction."""
3327 self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)
3329 def _real_extract(self, url):
3330 # Extract id from URL
3331 mobj = re.match(self._VALID_URL, url)
3333 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
3336 post_url = mobj.group(0)
3337 video_id = mobj.group(1)
3339 video_extension = 'flv'
3341 # Step 1, Retrieve post webpage to extract further information
3342 self.report_extract_entry(post_url)
3343 request = compat_urllib_request.Request(post_url)
3345 webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3346 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3347 self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))
3350 # Extract update date
3352 pattern = 'title="Timestamp">(.*?)</a>'
3353 mobj = re.search(pattern, webpage)
3355 upload_date = mobj.group(1)
3356 # Convert timestring to a format suitable for filename
# Page shows ISO-style dates; normalized to YYYYMMDD for the output template.
3357 upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3358 upload_date = upload_date.strftime('%Y%m%d')
3359 self.report_date(upload_date)
3363 pattern = r'rel\="author".*?>(.*?)</a>'
3364 mobj = re.search(pattern, webpage)
3366 uploader = mobj.group(1)
3367 self.report_uploader(uploader)
3370 # Get the first line for title
3372 pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3373 mobj = re.search(pattern, webpage)
3375 video_title = mobj.group(1)
3376 self.report_title(video_title)
3378 # Step 2, Stimulate clicking the image box to launch video
3379 pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3380 mobj = re.search(pattern, webpage)
3382 self._downloader.trouble(u'ERROR: unable to extract video page URL')
3384 video_page = mobj.group(1)
3385 request = compat_urllib_request.Request(video_page)
3387 webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3388 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3389 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
3391 self.report_extract_vid_page(video_page)
3394 # Extract video links on video page
3395 """Extract video links of all sizes"""
3396 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3397 mobj = re.findall(pattern, webpage)
3399 self._downloader.trouble(u'ERROR: unable to extract video links')
3401 # Sort in resolution
# findall yields (resolution, url) tuples; sorting then taking [-1] keeps the
# highest resolution, and [-1] on the tuple keeps only the URL.
3402 links = sorted(mobj)
3404 # Choose the lowest of the sort, i.e. highest resolution
3405 video_url = links[-1]
3406 # Only get the url. The resolution part in the tuple has no use anymore
3407 video_url = video_url[-1]
3408 # Treat escaped \u0026 style hex
# Py2 str has .decode; Py3 str does not, hence the AttributeError fallback.
3410 video_url = video_url.decode("unicode_escape")
3411 except AttributeError: # Python 3
3412 video_url = bytes(video_url, 'ascii').decode('unicode-escape')
3418 'uploader': uploader,
3419 'upload_date': upload_date,
3420 'title': video_title,
3421 'ext': video_extension,
# Extractor for nba.com video pages: the media URL is built directly from the
# path id; title/date/description are scraped via the _findProp helper.
# NOTE(review): numbered listing with gaps (the `if m:` branch of _findProp and
# most of the final info dict are missing); recover from VCS before editing.
3424 class NBAIE(InfoExtractor):
3425 _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
3428 def _real_extract(self, url):
3429 mobj = re.match(self._VALID_URL, url)
3431 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3434 video_id = mobj.group(1)
3435 if video_id.endswith('/index.html'):
3436 video_id = video_id[:-len('/index.html')]
3438 webpage = self._download_webpage(url, video_id)
# CDN URL is derived from the page path; no player round-trip needed.
3440 video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
# Closure helper: first regex group from the page, unescaped, else `default`.
3441 def _findProp(rexp, default=None):
3442 m = re.search(rexp, webpage)
3444 return unescapeHTML(m.group(1))
3448 shortened_video_id = video_id.rpartition('/')[2]
3449 title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
3451 'id': shortened_video_id,
# NOTE(review): key is 'uploader_date' here -- looks like a typo for
# 'upload_date' (the field name used elsewhere in this file); confirm upstream.
3455 'uploader_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
3456 'description': _findProp(r'<div class="description">(.*?)</h1>'),
# Extractor for justin.tv / twitch.tv. A channel URL enumerates the archive API
# in pages of _JUSTIN_PAGE_LIMIT clips; a /b/ URL fetches a single broadcast.
# NOTE(review): numbered listing with gaps (several guards, the `paged` setup and
# the final `return` are missing); recover the full source before editing code.
3460 class JustinTVIE(InfoExtractor):
3461 """Information extractor for justin.tv and twitch.tv"""
3462 # TODO: One broadcast may be split into multiple videos. The key
3463 # 'broadcast_id' is the same for all parts, and 'broadcast_part'
3464 # starts at 1 and increases. Can we treat all parts as one video?
3466 _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
3467 ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
3468 _JUSTIN_PAGE_LIMIT = 100
3469 IE_NAME = u'justin.tv'
3471 def report_extraction(self, file_id):
3472 """Report information extraction."""
3473 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3475 def report_download_page(self, channel, offset):
3476 """Report attempt to download a single page of videos."""
3477 self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
3478 (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
3480 # Return count of items, list of *valid* items
3481 def _parse_page(self, url):
3483 urlh = compat_urllib_request.urlopen(url)
3484 webpage_bytes = urlh.read()
3485 webpage = webpage_bytes.decode('utf-8', 'ignore')
3486 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3487 self._downloader.trouble(u'ERROR: unable to download video info JSON: %s' % compat_str(err))
3490 response = json.loads(webpage)
# API errors come back as a dict instead of the expected list of clips.
3491 if type(response) != list:
3492 error_text = response.get('error', 'unknown error')
3493 self._downloader.trouble(u'ERROR: Justin.tv API: %s' % error_text)
3496 for clip in response:
3497 video_url = clip['video_file_url']
3499 video_extension = os.path.splitext(video_url)[1][1:]
# start_time is ISO-dated; strip the dashes to get YYYYMMDD.
3500 video_date = re.sub('-', '', clip['start_time'][:10])
3501 video_uploader_id = clip.get('user_id', clip.get('channel_id'))
3502 video_id = clip['id']
3503 video_title = clip.get('title', video_id)
3507 'title': video_title,
3508 'uploader': clip.get('channel_name', video_uploader_id),
3509 'uploader_id': video_uploader_id,
3510 'upload_date': video_date,
3511 'ext': video_extension,
3513 return (len(response), info)
3515 def _real_extract(self, url):
3516 mobj = re.match(self._VALID_URL, url)
3518 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3521 api = 'http://api.justin.tv'
# lastindex tells channel (group 1 only) apart from single broadcast (group 2).
3522 video_id = mobj.group(mobj.lastindex)
3524 if mobj.lastindex == 1:
3526 api += '/channel/archives/%s.json'
3528 api += '/broadcast/by_archive/%s.json'
3529 api = api % (video_id,)
3531 self.report_extraction(video_id)
3535 limit = self._JUSTIN_PAGE_LIMIT
# Paginate until a short page (fewer than `limit` items) signals the end.
3538 self.report_download_page(video_id, offset)
3539 page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
3540 page_count, page_info = self._parse_page(page_url)
3541 info.extend(page_info)
3542 if not paged or page_count != limit:
# Extractor for funnyordie.com: video URL from the second <source> tag, title
# from the player heading, optional description from the og:description meta.
# NOTE(review): numbered listing with gaps (guards and the final info dict are
# missing); recover the full source from VCS before editing code.
3547 class FunnyOrDieIE(InfoExtractor):
3548 _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
3550 def _real_extract(self, url):
3551 mobj = re.match(self._VALID_URL, url)
3553 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3556 video_id = mobj.group('id')
3557 webpage = self._download_webpage(url, video_id)
# DOTALL lets the pattern span the multi-line <video> element.
3559 m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
3561 self._downloader.trouble(u'ERROR: unable to find video information')
3562 video_url = unescapeHTML(m.group('url'))
3564 m = re.search(r"class='player_page_h1'>\s+<a.*?>(?P<title>.*?)</a>", webpage)
3566 self._downloader.trouble(u'Cannot find video title')
3567 title = unescapeHTML(m.group('title'))
3569 m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
3571 desc = unescapeHTML(m.group('desc'))
3580 'description': desc,
# Extractor for tweetreel.com: reads status id, tweet text (as description),
# uploader and unix timestamp out of the page, then builds the .mov URL from
# the status id.
# NOTE(review): numbered listing with gaps (guards and the info-dict opener are
# missing); recover the full source from VCS before editing code.
3584 class TweetReelIE(InfoExtractor):
3585 _VALID_URL = r'^(?:https?://)?(?:www\.)?tweetreel\.com/[?](?P<id>[0-9a-z]+)$'
3587 def _real_extract(self, url):
3588 mobj = re.match(self._VALID_URL, url)
3590 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3593 video_id = mobj.group('id')
3594 webpage = self._download_webpage(url, video_id)
3596 m = re.search(r'<div id="left" status_id="([0-9]+)">', webpage)
3598 self._downloader.trouble(u'ERROR: Cannot find status ID')
3599 status_id = m.group(1)
3601 m = re.search(r'<div class="tweet_text">(.*?)</div>', webpage, flags=re.DOTALL)
3603 self._downloader.trouble(u'WARNING: Cannot find description')
# Strip embedded <a> links before unescaping the tweet text.
3604 desc = unescapeHTML(re.sub('<a.*?</a>', '', m.group(1))).strip()
3606 m = re.search(r'<div class="tweet_info">.*?from <a target="_blank" href="https?://twitter.com/(?P<uploader_id>.+?)">(?P<uploader>.+?)</a>', webpage, flags=re.DOTALL)
3608 self._downloader.trouble(u'ERROR: Cannot find uploader')
3609 uploader = unescapeHTML(m.group('uploader'))
3610 uploader_id = unescapeHTML(m.group('uploader_id'))
3612 m = re.search(r'<span unixtime="([0-9]+)"', webpage)
3614 self._downloader.trouble(u'ERROR: Cannot find upload date')
# Unix timestamp -> local YYYYMMDD (fromtimestamp uses local time, not UTC).
3615 upload_date = datetime.datetime.fromtimestamp(int(m.group(1))).strftime('%Y%m%d')
3618 video_url = 'http://files.tweetreel.com/video/' + status_id + '.mov'
3625 'description': desc,
3626 'uploader': uploader,
3627 'uploader_id': uploader_id,
3628 'internal_id': status_id,
3629 'upload_date': upload_date
# Extractor for store.steampowered.com trailer pages: zips together the movie
# JS blobs, the <span class="title"> names and the thumbnail <img> tags, and
# emits one entry per trailer on the game's /video/ page.
# NOTE(review): numbered listing with gaps (the gameID part of _VALID_URL and
# the per-video info dict are missing); recover from VCS before editing code.
3634 class SteamIE(InfoExtractor):
3635 _VALID_URL = r"""http://store.steampowered.com/
3636 (?P<urltype>video|app)/ #If the page is only for videos or for a game
3637 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
# _VALID_URL is a verbose regex, so the base suitable() (plain re.match) is
# overridden to pass re.VERBOSE.
3641 def suitable(cls, url):
3642 """Receives a URL and returns True if suitable for this IE."""
3643 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
3645 def _real_extract(self, url):
3646 m = re.match(self._VALID_URL, url, re.VERBOSE)
3647 urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
3648 gameID = m.group('gameID')
3649 videourl = 'http://store.steampowered.com/video/%s/' % gameID
3650 webpage = self._download_webpage(videourl, gameID)
3651 mweb = re.finditer(urlRE, webpage)
3652 namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
3653 titles = re.finditer(namesRE, webpage)
3654 thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
3655 thumbs = re.finditer(thumbsRE, webpage)
# zip() pairs the three scans positionally -- assumes they appear in the same
# order on the page.
3657 for vid,vtitle,thumb in zip(mweb,titles,thumbs):
3658 video_id = vid.group('videoID')
3659 title = vtitle.group('videoName')
3660 video_url = vid.group('videoURL')
3661 video_thumb = thumb.group('thumbnail')
3663 self._downloader.trouble(u'ERROR: Cannot find video url for %s' % video_id)
3668 'title': unescapeHTML(title),
3669 'thumbnail': video_thumb
# Extractor for ustream.tv recorded videos: the media URL is derived directly
# from the numeric recording id; title/uploader are scraped from data-attrs.
# NOTE(review): numbered listing with gaps (the info-dict opener is missing);
# recover the full source from VCS before editing code.
3674 class UstreamIE(InfoExtractor):
3675 _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
3676 IE_NAME = u'ustream'
3678 def _real_extract(self, url):
3679 m = re.match(self._VALID_URL, url)
3680 video_id = m.group('videoID')
# CDN URL is built straight from the id -- no player API round-trip.
3681 video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
3682 webpage = self._download_webpage(url, video_id)
3683 m = re.search(r'data-title="(?P<title>.+)"',webpage)
3684 title = m.group('title')
# NOTE(review): no `m is None` guard visible for either search -- a miss would
# raise AttributeError; may be handled by lines absent from this listing.
3685 m = re.search(r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',webpage)
3686 uploader = m.group('uploader')
3692 'uploader': uploader
# Extractor for rbmaradio.com shows: parses the gon.show JSON blob embedded in
# a <script> tag and appends a bitrate parameter to the Akamai stream URL.
# NOTE(review): numbered listing with gaps (the `try:` header and info-dict
# opener are missing); recover the full source from VCS before editing code.
3696 class RBMARadioIE(InfoExtractor):
3697 _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'
3699 def _real_extract(self, url):
3700 m = re.match(self._VALID_URL, url)
3701 video_id = m.group('videoID')
3703 webpage = self._download_webpage(url, video_id)
3704 m = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
3706 raise ExtractorError(u'Cannot find metadata')
3707 json_data = m.group(1)
3710 data = json.loads(json_data)
3711 except ValueError as e:
3712 raise ExtractorError(u'Invalid JSON: ' + str(e))
# Request the 256 kbit/s variant of the Akamai stream.
3714 video_url = data['akamai_url'] + '&cbr=256'
3715 url_parts = compat_urllib_parse_urlparse(video_url)
3716 video_ext = url_parts.path.rpartition('.')[2]
# Optional metadata uses .get() chains so missing keys degrade to None.
3721 'title': data['title'],
3722 'description': data.get('teaser_text'),
3723 'location': data.get('country_of_origin'),
3724 'uploader': data.get('host', {}).get('name'),
3725 'uploader_id': data.get('host', {}).get('slug'),
3726 'thumbnail': data.get('image', {}).get('large_url_2x'),
3727 'duration': data.get('duration'),
# Extractor for youporn.com: scrapes title/date/uploader, then parses the
# downloadList <ul> to build one format entry per link, honoring --list-formats
# and the requested format ('best', 'worst', 'all', or a specific one).
# NOTE(review): numbered listing with gaps (loop headers, several `return`s and
# the per-format info-dict opener are missing); recover from VCS before editing.
3733 class YouPornIE(InfoExtractor):
3734 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
3736 def _print_formats(self, formats):
3737 """Print all available formats"""
3738 print(u'Available formats:')
3739 print(u'ext\t\tformat')
3740 print(u'---------------------------------')
3741 for format in formats:
3742 print(u'%s\t\t%s' % (format['ext'], format['format']))
# Select the single entry whose 'format' field matches req_format.
3744 def _specific(self, req_format, formats):
3746 if(x["format"]==req_format):
3750 def _real_extract(self, url):
3751 mobj = re.match(self._VALID_URL, url)
3753 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3756 video_id = mobj.group('videoid')
# Age gate is bypassed with a cookie rather than a form post.
3758 req = compat_urllib_request.Request(url)
3759 req.add_header('Cookie', 'age_verified=1')
3760 webpage = self._download_webpage(req, video_id)
3762 # Get the video title
3763 result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
3765 raise ExtractorError(u'Unable to extract video title')
3766 video_title = result.group('title').strip()
3768 # Get the video date
3769 result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
3771 self._downloader.to_stderr(u'WARNING: unable to extract video date')
3774 upload_date = result.group('date').strip()
3776 # Get the video uploader
3777 result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
3779 self._downloader.to_stderr(u'WARNING: unable to extract uploader')
3780 video_uploader = None
3782 video_uploader = result.group('uploader').strip()
3783 video_uploader = clean_html( video_uploader )
3785 # Get all of the formats available
3786 DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
3787 result = re.search(DOWNLOAD_LIST_RE, webpage)
3789 raise ExtractorError(u'Unable to extract download list')
3790 download_list_html = result.group('download_list').strip()
3792 # Get all of the links from the page
3793 LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
3794 links = re.findall(LINK_RE, download_list_html)
3795 if(len(links) == 0):
3796 raise ExtractorError(u'ERROR: no known formats available for video')
3798 self._downloader.to_screen(u'[youporn] Links found: %d' % len(links))
3803 # A link looks like this:
3804 # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
3805 # A path looks like this:
3806 # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
3807 video_url = unescapeHTML( link )
3808 path = compat_urllib_parse_urlparse( video_url ).path
3809 extension = os.path.splitext( path )[1][1:]
# Path segment 4 looks like "480p_370k_8004515": first two pieces are
# resolution and bitrate (presumably unpacked to size/bitrate on missing lines).
3810 format = path.split('/')[4].split('_')[:2]
3813 format = "-".join( format )
3814 title = u'%s-%s-%s' % (video_title, size, bitrate)
3819 'uploader': video_uploader,
3820 'upload_date': upload_date,
3825 'description': None,
3829 if self._downloader.params.get('listformats', None):
3830 self._print_formats(formats)
3833 req_format = self._downloader.params.get('format', None)
3834 self._downloader.to_screen(u'[youporn] Format: %s' % req_format)
# Format selection: formats are evidently ordered best-first ([-1] is 'worst').
3836 if req_format is None or req_format == 'best':
3838 elif req_format == 'worst':
3839 return [formats[-1]]
3840 elif req_format in ('-1', 'all'):
3843 format = self._specific( req_format, formats )
3845 self._downloader.trouble(u'ERROR: requested format not available')
# Extractor for pornotube.com: flv URL and upload date scraped from the watch
# page; the title comes from the URL itself (named group in _VALID_URL).
# NOTE(review): numbered listing with gaps (guards and part of the info dict
# are missing); recover the full source from VCS before editing code.
3852 class PornotubeIE(InfoExtractor):
3853 """Information extractor for pornotube.com."""
3854 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
3855 def _real_extract(self, url):
3856 mobj = re.match(self._VALID_URL, url)
3858 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3861 video_id = mobj.group('videoid')
3862 video_title = mobj.group('title')
3864 # Get webpage content
3865 webpage = self._download_webpage(url, video_id)
3868 VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
3869 result = re.search(VIDEO_URL_RE, webpage)
3871 self._downloader.trouble(u'ERROR: unable to extract video url')
3873 video_url = compat_urllib_parse.unquote(result.group('url'))
3875 #Get the uploaded date
3876 VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
3877 result = re.search(VIDEO_UPLOADED_RE, webpage)
# NOTE(review): message says "title" but this guard is for the upload date --
# likely a copy/paste slip; confirm against the full source.
3879 self._downloader.trouble(u'ERROR: unable to extract video title')
3881 upload_date = result.group('date')
3883 info = {'id': video_id,
3886 'upload_date': upload_date,
3887 'title': video_title,
# Extractor for youjizz.com: reads the title from the watch page, follows the
# embed page, and pulls the real media URL from the flash player setup call.
# NOTE(review): numbered listing with gaps (guards and part of the info dict
# are missing); recover the full source from VCS before editing code.
3893 class YouJizzIE(InfoExtractor):
3894 """Information extractor for youjizz.com."""
3895 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'
3897 def _real_extract(self, url):
3898 mobj = re.match(self._VALID_URL, url)
3900 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3903 video_id = mobj.group('videoid')
3905 # Get webpage content
3906 webpage = self._download_webpage(url, video_id)
3908 # Get the video title
3909 result = re.search(r'<title>(?P<title>.*)</title>', webpage)
3911 raise ExtractorError(u'ERROR: unable to extract video title')
3912 video_title = result.group('title').strip()
3914 # Get the embed page
3915 result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
3917 raise ExtractorError(u'ERROR: unable to extract embed page')
# The embed page id replaces the slug id from the watch URL.
3919 embed_page_url = result.group(0).strip()
3920 video_id = result.group('videoid')
3922 webpage = self._download_webpage(embed_page_url, video_id)
# Media URL is the encodeURIComponent argument in the player's addVariable call.
3925 result = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
3927 raise ExtractorError(u'ERROR: unable to extract video url')
3928 video_url = result.group('source')
3930 info = {'id': video_id,
3932 'title': video_title,
3935 'player_url': embed_page_url}
# Extractor for 8tracks.com mixes: reads the PAGE.mix JSON from the playlist
# page, then walks the play/next API with a random session id, collecting one
# track per iteration until at_last_track is set.
# NOTE(review): numbered listing with gaps (mix_id assignment, the track-info
# opener and the final return are missing); recover from VCS before editing.
3939 class EightTracksIE(InfoExtractor):
3941 _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'
3943 def _real_extract(self, url):
3944 mobj = re.match(self._VALID_URL, url)
3946 raise ExtractorError(u'Invalid URL: %s' % url)
3947 playlist_id = mobj.group('id')
3949 webpage = self._download_webpage(url, playlist_id)
3951 m = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
3953 raise ExtractorError(u'Cannot find trax information')
3954 json_like = m.group(1)
3955 data = json.loads(json_like)
# Client-generated session id for the play API (random, not security-sensitive).
3957 session = str(random.randint(0, 1000000000))
3959 track_count = data['tracks_count']
3960 first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
3961 next_url = first_url
# Unbounded count(); the loop exits when the API reports the last track.
3963 for i in itertools.count():
3964 api_json = self._download_webpage(next_url, playlist_id,
3965 note=u'Downloading song information %s/%s' % (str(i+1), track_count),
3966 errnote=u'Failed to download song information')
3967 api_data = json.loads(api_json)
3968 track_data = api_data[u'set']['track']
3970 'id': track_data['id'],
3971 'url': track_data['track_file_stream_url'],
3972 'title': track_data['performer'] + u' - ' + track_data['name'],
3973 'raw_title': track_data['name'],
3974 'uploader_id': data['user']['login'],
3978 if api_data['set']['at_last_track']:
# Each /next call carries the previous track id to advance the server cursor.
3980 next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
# Extractor for keek.com: media and thumbnail URLs are built directly from the
# video id; title/uploader are scraped from og:title and the bio block.
# NOTE(review): numbered listing with gaps (IE_NAME and the info-dict opener
# are missing); recover the full source from VCS before editing code.
3983 class KeekIE(InfoExtractor):
3984 _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
3987 def _real_extract(self, url):
3988 m = re.match(self._VALID_URL, url)
3989 video_id = m.group('videoID')
# CDN URLs derive from the id; no API round-trip needed.
3990 video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
3991 thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
3992 webpage = self._download_webpage(url, video_id)
3993 m = re.search(r'<meta property="og:title" content="(?P<title>.+)"', webpage)
3994 title = unescapeHTML(m.group('title'))
# NOTE(review): no `m is None` guards visible here -- a regex miss would raise
# AttributeError; may be handled by lines absent from this listing.
3995 m = re.search(r'<div class="bio-names-and-report">[\s\n]+<h4>(?P<uploader>\w+)</h4>', webpage)
3996 uploader = unescapeHTML(m.group('uploader'))
4002 'thumbnail': thumbnail,
4003 'uploader': uploader
# Extractor for ted.com talks and playlists. _VALID_URL is a verbose regex with
# two alternatives (type_playlist vs type_talk); suitable() is overridden to
# pass re.VERBOSE, same as SteamIE.
# NOTE(review): numbered listing with gaps (the `else:` of _real_extract and the
# talk info dict are missing); recover the full source from VCS before editing.
4008 class TEDIE(InfoExtractor):
4009 _VALID_URL=r'''http://www.ted.com/
4010 ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
4012 ((?P<type_talk>talks)) # We have a simple talk
4014 /(?P<name>\w+) # Here goes the name and then ".html"
4018 def suitable(cls, url):
4019 """Receives a URL and returns True if suitable for this IE."""
4020 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
4022 def _real_extract(self, url):
4023 m=re.match(self._VALID_URL, url, re.VERBOSE)
# Single talk -> one-element list; playlist -> delegate to the playlist walker.
4024 if m.group('type_talk'):
4025 return [self._talk_info(url)]
4027 playlist_id=m.group('playlist_id')
4028 name=m.group('name')
4029 self._downloader.to_screen(u'[%s] Getting info of playlist %s: "%s"' % (self.IE_NAME,playlist_id,name))
4030 return self._playlist_videos_info(url,name,playlist_id)
4032 def _talk_video_link(self,mediaSlug):
4033 '''Returns the video link for that mediaSlug'''
4034 return 'http://download.ted.com/talks/%s.mp4' % mediaSlug
4036 def _playlist_videos_info(self,url,name,playlist_id=0):
4037 '''Returns the videos of the playlist'''
4039 <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
4040 ([.\s]*?)data-playlist_item_id="(\d+)"
4041 ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
4043 video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
4044 webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
4045 m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
4046 m_names=re.finditer(video_name_RE,webpage)
# Pair the two scans positionally and recurse into each talk page.
4048 for m_video, m_name in zip(m_videos,m_names):
4049 video_id=m_video.group('video_id')
4050 talk_url='http://www.ted.com%s' % m_name.group('talk_url')
4051 info.append(self._talk_info(talk_url,video_id))
4054 def _talk_info(self, url, video_id=0):
4055 """Return the video for the talk in the url"""
4056 m=re.match(self._VALID_URL, url,re.VERBOSE)
4057 videoName=m.group('name')
4058 webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
4059 # If the url includes the language we get the title translated
4060 title_RE=r'<h1><span id="altHeadline" >(?P<title>.*)</span></h1>'
4061 title=re.search(title_RE, webpage).group('title')
# talkDetails JS blob carries the numeric id and the download mediaSlug.
4062 info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
4063 "id":(?P<videoID>[\d]+).*?
4064 "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
4065 thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
4066 thumb_match=re.search(thumb_RE,webpage)
4067 info_match=re.search(info_RE,webpage,re.VERBOSE)
4068 video_id=info_match.group('videoID')
4069 mediaSlug=info_match.group('mediaSlug')
4070 video_url=self._talk_video_link(mediaSlug)
4076 'thumbnail': thumb_match.group('thumbnail')
# Information extractor for myspass.de; metadata comes from the site's own
# XML endpoint rather than from the HTML page.
# NOTE(review): this excerpt is elided (gaps in the original-line numbering) —
# the `return` statements after the trouble() calls, the default values for
# format/description/thumbnail, and the final info dict are not fully visible.
4080 class MySpassIE(InfoExtractor):
4081 _VALID_URL = r'http://www.myspass.de/.*'
4083 def _real_extract(self, url):
4084 META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'
4086 # video id is the last path element of the URL
4087 # usually there is a trailing slash, so also try the second but last
4088 url_path = compat_urllib_parse_urlparse(url).path
4089 url_parent_path, video_id = os.path.split(url_path)
4091 _, video_id = os.path.split(url_parent_path)
# Fetch the XML metadata document for this id and parse it.
# NOTE(review): fromstring() raises xml.etree.ElementTree.ParseError on
# malformed XML — that case is not handled here.
4094 metadata_url = META_DATA_URL_TEMPLATE % video_id
4095 metadata_text = self._download_webpage(metadata_url, video_id)
4096 metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))
4098 # extract values from metadata
# <url_flv> and <title> are mandatory; missing either is reported as an error.
4099 url_flv_el = metadata.find('url_flv')
4100 if url_flv_el is None:
4101 self._downloader.trouble(u'ERROR: unable to extract download url')
4103 video_url = url_flv_el.text
# File extension is taken from the download URL's path suffix (without dot).
4104 extension = os.path.splitext(video_url)[1][1:]
4105 title_el = metadata.find('title')
4106 if title_el is None:
4107 self._downloader.trouble(u'ERROR: unable to extract title')
4109 title = title_el.text
4110 format_id_el = metadata.find('format_id')
4111 if format_id_el is None:
4114 format = format_id_el.text
# description / imagePreview are optional: only set when the element exists.
4115 description_el = metadata.find('description')
4116 if description_el is not None:
4117 description = description_el.text
4120 imagePreview_el = metadata.find('imagePreview')
4121 if imagePreview_el is not None:
4122 thumbnail = imagePreview_el.text
# Partial view of the returned info dict (surrounding lines elided).
4131 'thumbnail': thumbnail,
4132 'description': description
4136 def gen_extractors():
4137 """ Return a list of an instance of every supported extractor.
4138 The order does matter; the first extractor matched is the one handling the URL.
4141 YoutubePlaylistIE(),
4165 StanfordOpenClassroomIE(),