2 # -*- coding: utf-8 -*-
4 from __future__ import absolute_import
15 import xml.etree.ElementTree
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    title:          Video title, unescaped.
    ext:            Video filename extension.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The .srt file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    The fields should all be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    # NOTE(review): this view of the source is elided — some guard lines,
    # try: openers, decorators and returns are missing from the bodies
    # below.  Compare against the upstream file before changing code.

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        # NOTE(review): a line appears elided here (presumably an
        # initialization of a "ready" flag) — confirm upstream.
        self.set_downloader(downloader)

    # NOTE(review): a @classmethod decorator is presumably elided here —
    # the first parameter is named cls.
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url) is not None

    # NOTE(review): the enclosing def (a _WORKING accessor) is elided.
        """Getter method for _WORKING."""

    # NOTE(review): the enclosing def (initialize) is elided.
        """Initializes an instance (authentication, etc)."""
        self._real_initialize()

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""

    # NOTE(review): the enclosing def (an IE_NAME property) is elided;
    # the expression strips the trailing "IE" from the class name.
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the response handle """
        # NOTE(review): a note-is-None guard appears elided before this line.
        note = u'Downloading video webpage'
        self._downloader.to_screen(u'[%s] %s: %s' % (self.IE_NAME, video_id, note))
        # NOTE(review): the opening try: of this handler appears elided.
        return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            # NOTE(review): an errnote-is-None guard appears elided here.
            errnote = u'Unable to download webpage'
            # Re-raise as ExtractorError, preserving the original traceback.
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the data of the page as a string """
        urlh = self._request_webpage(url_or_request, video_id, note, errnote)
        webpage_bytes = urlh.read()
        # 'replace' keeps decoding going on malformed UTF-8 instead of raising.
        return webpage_bytes.decode('utf-8', 'replace')
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # NOTE(review): this view of the source is elided — assignment headers,
    # guard lines, try: openers, else: branches and some returns are missing
    # from the bodies below.  Compare against upstream before changing code.

    # NOTE(review): the lines below are the interior of the verbose
    # _VALID_URL regex; its r'''...''' wrapper is not visible in this view.
        (?:https?://)?                                       # http(s):// (optional)
        (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
           tube\.majestyc\.net/)                             # the various hostnames, with wildcard subdomains
        (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
        (?:                                                  # the various things that can precede the ID:
            (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
            |(?:                                             # or the v= param in all its forms
                (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
            )?                                               # optional -> youtube.com/xxxx is OK
        )?                                                   # all until now is optional -> you can pass the naked ID
        ([0-9A-Za-z_-]+)                                     # here is it! the YouTube video ID
        (?(1).+)?                                            # if we found the ID, everything can follow
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # itag -> container extension; most entries elided in this view.
    _video_extensions = {
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
    # itag -> resolution string; entries elided in this view.
    _video_dimensions = {

    # NOTE(review): a @classmethod decorator is presumably elided here.
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Defer playlist URLs to the playlist extractor.
        if YoutubePlaylistIE.suitable(url): return False
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to download video subtitles."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report extracted video URL."""
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')

    def _closed_captions_xml_to_srt(self, xml_string):
        """Convert YouTube's timedtext XML into .srt subtitle text.

        NOTE(review): the srt accumulator initialization, the float(start)
        conversion and the final return appear elided in this view.
        """
        texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
        # TODO parse xml instead of regex
        for n, (start, dur_tag, dur, caption) in enumerate(texts):
            # Captions without an explicit duration get a 4-second default.
            if not dur: dur = '4'
            end = start + float(dur)
            # Format as SRT timestamps: HH:MM:SS,mmm
            start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
            end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
            caption = unescapeHTML(caption)
            caption = unescapeHTML(caption) # double cycle, intentional
            srt += str(n+1) + '\n'
            srt += start + ' --> ' + end + '\n'
            srt += caption + '\n\n'

    def _extract_subtitles(self, video_id):
        """Fetch subtitles for video_id.

        Returns a (warning_message, srt_contents) pair; exactly one of the
        two is None.
        """
        self.report_video_subtitles_download(video_id)
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
        # NOTE(review): try: opener elided before this line.
        srt_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
        # Map language code -> track name.
        srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
        srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
        if not srt_lang_list:
            return (u'WARNING: video has no closed captions', None)
        # Pick the language: user-requested, else English, else first listed.
        if self._downloader.params.get('subtitleslang', False):
            srt_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in srt_lang_list:
            # NOTE(review): this branch's body and the else: header are elided.
            srt_lang = list(srt_lang_list.keys())[0]
        if not srt_lang in srt_lang_list:
            return (u'WARNING: no closed captions found in the specified language', None)
        # NOTE(review): some dict entries of this request appear elided.
        params = compat_urllib_parse.urlencode({
            'name': srt_lang_list[srt_lang].encode('utf-8'),
        url = 'http://www.youtube.com/api/timedtext?' + params
        # NOTE(review): try: opener elided before this line.
        srt_xml = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
        # NOTE(review): an emptiness guard appears elided before this return.
            return (u'WARNING: Did not fetch video subtitles', None)
        return (None, self._closed_captions_xml_to_srt(srt_xml))

    def _print_formats(self, formats):
        """Print the available itags with extension and dimensions."""
        print('Available formats:')
        # NOTE(review): the for-loop header over formats appears elided.
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))

    def _real_initialize(self):
        """Set language, then optionally log in and confirm age.

        NOTE(review): heavily elided in this view — several guards, try:
        openers, form entries and returns are missing below.
        """
        if self._downloader is None:

        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            # NOTE(review): try: opener and the success branch are elided.
            info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))

        # Force the English page so later regex-based scraping is stable.
        request = compat_urllib_request.Request(self._LANG_URL)
        # NOTE(review): try: opener elided before this line.
        compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))

        # No authentication to be performed

        request = compat_urllib_request.Request(self._LOGIN_URL)
        # NOTE(review): try: opener elided before this line.
        login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))

        # Scrape the anti-CSRF tokens out of the login form.
        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
            galx = match.group(1)

        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)

        # NOTE(review): the opening of the login form dict and several of
        # its entries are elided here.
            u'continue': u'http://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            u'PersistentCookie': u'yes',
            u'bgresponse': u'js_disabled',
            u'checkConnection': u'',
            u'checkedDomains': u'youtube',
            u'signIn': u'Sign in',
            u'service': u'youtube',

        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        # NOTE(review): try: opener elided before this line.
        login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        # If the login form is still present, authentication failed.
        if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
            self._downloader.report_warning(u'unable to log in: bad username or password')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))

        # NOTE(review): the age_form dict opening is elided here.
            'action_confirm': 'Confirm',
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        # NOTE(review): try: opener elided before this line.
        self.report_age_confirmation()
        age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))

    def _extract_id(self, url):
        """Extract the 11-character video id from url (group 2 of _VALID_URL)."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        # NOTE(review): the mobj-is-None guard and the return are elided.
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group(2)

    def _real_extract(self, url):
        """Download webpage + get_video_info, pick formats, and return
        one info dict per selected format.

        NOTE(review): heavily elided in this view — guard lines, try:
        openers, else: branches and returns are missing below.
        """
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
            url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        self.report_video_webpage_download(video_id)
        url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
        # NOTE(review): try: opener elided before this line.
        video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
            # Un-escape the backslash-escaped URL.
            player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))

        # Try several 'el' values; stop at the first response with a token.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = compat_urllib_request.Request(video_info_url)
            # NOTE(review): try: opener elided before this line.
            video_info_webpage_bytes = compat_urllib_request.urlopen(request).read()
            video_info_webpage = video_info_webpage_bytes.decode('utf-8', 'ignore')
            video_info = compat_parse_qs(video_info_webpage)
            if 'token' in video_info:
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))

        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0])
                self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.trouble(u'ERROR: "rental" videos not supported')

        # Start extracting information
        self.report_information_extraction(video_id)

        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader name')
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
            video_uploader_id = mobj.group(1)
            self._downloader.trouble(u'WARNING: unable to extract uploader nickname')

        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
        else:   # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # Upload date: scrape from the page and try several textual formats.
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
            # Normalize separators to spaces before parsing.
            upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
            format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
            for expression in format_expressions:
                # NOTE(review): try/except around strptime appears elided.
                    upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')

        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
            video_description = ''

        # Closed captions (only when requested by the user).
        video_subtitles = None
        if self._downloader.params.get('writesubtitles', False):
            (srt_error, video_subtitles) = self._extract_subtitles(video_id)
                self._downloader.trouble(srt_error)

        if 'length_seconds' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video duration')
            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                # Cap quality at the requested limit.
                format_list = available_formats[available_formats.index(format_limit):]
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
                # Specific formats. We pick the first in a slash-delimited sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                        video_url_list = [(rf, url_map[rf])]
                if video_url_list is None:
                    self._downloader.trouble(u'ERROR: requested format not available')
            self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')

        # Build one info dict per selected (format, url) pair.
        for format_param, video_real_url in video_url_list:
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

            # NOTE(review): the results.append({ opener and the 'id' entry
            # appear elided here.
                'url':      video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date':  upload_date,
                'title':    video_title,
                'ext':      video_extension,
                'format':   video_format,
                'thumbnail':    video_thumbnail,
                'description':  video_description,
                'player_url':   player_url,
                'subtitles':    video_subtitles,
                'duration':     video_duration
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    # NOTE(review): this view of the source is elided — guard lines, try:
    # openers, else: branches and returns are missing from bodies below.

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        """Fetch the disclaimer page, then POST the family-filter form to
        disable filtering for this session."""
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        # NOTE(review): try: opener elided before this line.
        self.report_disclaimer()
        disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err))

        # NOTE(review): the disclaimer_form dict opening is elided here.
            'submit': "Continue - I'm over 18",
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
        # NOTE(review): try: opener elided before this line.
        self.report_age_confirmation()
        disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))

    def _real_extract(self, url):
        """Extract the media URL, title and uploader for a metacafe video.

        NOTE(review): elided in this view — several None-checks, else:
        branches and the final return appear to be missing below.
        """
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            # Delegate yt-prefixed ids straight to the YouTube extractor.
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
        # NOTE(review): try: opener elided before this line.
        self.report_download_webpage(video_id)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
            # Fall back to the flashvars-encoded media data.
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
                self._downloader.trouble(u'ERROR: unable to extract media URL')
            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                self._downloader.trouble(u'ERROR: unable to extract media URL')
            mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
                self._downloader.trouble(u'ERROR: unable to extract media URL')
            mediaURL = mobj.group(1).replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'submitter=(.*?);', webpage)
            self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = mobj.group(1)

        # NOTE(review): the return [{ opener appears elided here.
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader.decode('utf-8'),
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    # NOTE(review): this view of the source is elided — guard lines, else:
    # branches and the final return are missing from bodies below.

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the best available quality URL, title, uploader and
        upload date for a Dailymotion video."""
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # Strip any title suffix and query string from the id.
        video_id = mobj.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        # Cookie disables the family filter so restricted videos resolve.
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'\s*var flashvars = (.*)', webpage)
            self._downloader.trouble(u'ERROR: unable to extract media URL')
        flashvars = compat_urllib_parse.unquote(mobj.group(1))

        # Probe qualities from best to worst; first hit wins.
        for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
                self._downloader.to_screen(u'[dailymotion] Using %s' % key)
            self._downloader.trouble(u'ERROR: unable to extract video URL')

        mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
            self._downloader.trouble(u'ERROR: unable to extract video URL')

        video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = unescapeHTML(mobj.group('title'))

        video_uploader = None
        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
            # looking for official user
            mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
            if mobj_official is None:
                self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
                video_uploader = mobj_official.group(1)
            video_uploader = mobj.group(1)

        video_upload_date = None
        mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
            # Page shows DD-MM-YYYY; reorder into YYYYMMDD.
            video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)

        # NOTE(review): the return [{ opener and 'id'/'url' entries appear
        # elided here.
            'uploader': video_uploader,
            'upload_date':  video_upload_date,
            'title':    video_title,
            'ext':      video_extension,
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    # NOTE(review): this view of the source is elided — guard lines, try:
    # openers and the final return are missing from bodies below.

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the flv media URL, title and uploader for a Photobucket
        video."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        # NOTE(review): try: opener elided before this line.
        self.report_download_webpage(video_id)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract media URL')
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))

        # Title and uploader come from the same <title> pattern.
        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        video_uploader = mobj.group(2).decode('utf-8')

        # NOTE(review): the return [{ opener appears elided here.
            'id':       video_id.decode('utf-8'),
            'url':      video_url.decode('utf-8'),
            'uploader': video_uploader,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    # NOTE(review): this view of the source is elided — guard lines, try:
    # openers and the final return are missing from bodies below.

    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract a Yahoo! video.

        Non-/watch/ URLs are first rewritten to the canonical /watch/ form
        and re-extracted once (new_video=False prevents infinite recursion).
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = compat_urllib_request.Request(url)
            # NOTE(review): try: opener elided before this line.
            webpage = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
                self._downloader.trouble(u'ERROR: Unable to extract id field')
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
                self._downloader.trouble(u'ERROR: Unable to extract vid field')
            yahoo_vid = mobj.group(1)

            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        # NOTE(review): try: opener elided before this line.
        self.report_download_webpage(video_id)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video uploader')
        video_uploader = mobj.group(1).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video description')
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video height')
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
            self._downloader.trouble(u'ERROR: unable to extract video width')
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR'  # not sure what this represents
        yv_bitrate = '700'  # according to Wikipedia this is hard-coded
        request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        # NOTE(review): try: opener elided before this line.
        self.report_download_webpage(video_id)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
            self._downloader.trouble(u'ERROR: Unable to extract media URL')
        video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        video_url = unescapeHTML(video_url)

        # NOTE(review): the return [{ opener and the 'url' entry appear
        # elided here.
            'id':       video_id.decode('utf-8'),
            'uploader': video_uploader,
            'upload_date':  None,
            'title':    video_title,
            'ext':      video_extension.decode('utf-8'),
            'thumbnail':    video_thumbnail.decode('utf-8'),
            'description':  video_description,
1017 class VimeoIE(InfoExtractor):
1018 """Information extractor for vimeo.com."""
1020 # _VALID_URL matches Vimeo URLs
1021 _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
1024 def __init__(self, downloader=None):
1025 InfoExtractor.__init__(self, downloader)
1027 def report_download_webpage(self, video_id):
1028 """Report webpage download."""
1029 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
1031 def report_extraction(self, video_id):
1032 """Report information extraction."""
1033 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
1035 def _real_extract(self, url, new_video=True):
1036 # Extract ID from URL
1037 mobj = re.match(self._VALID_URL, url)
# NOTE(review): the no-match guard header (and its early return) is elided in
# this chunk; 1039 below is its error branch.
1039 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1042 video_id = mobj.group('id')
# Normalize the URL: force https, and map direct play_redirect_hls links
# back to the canonical video page.
1043 if not mobj.group('proto'):
1044 url = 'https://' + url
1045 if mobj.group('direct_link'):
1046 url = 'https://vimeo.com/' + video_id
1048 # Retrieve video webpage to extract further information
1049 request = compat_urllib_request.Request(url, None, std_headers)
# NOTE(review): a try: header is elided here; 1054 is the matching except.
1051 self.report_download_webpage(video_id)
1052 webpage_bytes = compat_urllib_request.urlopen(request).read()
1053 webpage = webpage_bytes.decode('utf-8')
1054 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1055 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1058 # Now we begin extracting as much information as we can from what we
1059 # retrieved. First we extract the information common to all extractors,
1060 # and latter we extract those that are Vimeo specific.
1061 self.report_extraction(video_id)
1063 # Extract the config JSON
# The config JSON is embedded in the page between ' = {config:' and ',assets:'.
1065 config = webpage.split(' = {config:')[1].split(',assets:')[0]
1066 config = json.loads(config)
# NOTE(review): the except header for the parse-failure path is elided; 1068 is its body.
1068 self._downloader.trouble(u'ERROR: unable to extract info section')
1072 video_title = config["video"]["title"]
1074 # Extract uploader and uploader_id
1075 video_uploader = config["video"]["owner"]["name"]
1076 video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]
1078 # Extract video thumbnail
1079 video_thumbnail = config["video"]["thumbnail"]
1081 # Extract video description
1082 video_description = get_element_by_attribute("itemprop", "description", webpage)
1083 if video_description: video_description = clean_html(video_description)
1084 else: video_description = ''
1086 # Extract upload date
1087 video_upload_date = None
1088 mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
1089 if mobj is not None:
# Collapse the ISO date parts into YYYYMMDD (the info-dict contract).
1090 video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)
1092 # Vimeo specific: extract request signature and timestamp
1093 sig = config['request']['signature']
1094 timestamp = config['request']['timestamp']
1096 # Vimeo specific: extract video codec and quality information
1097 # First consider quality, then codecs, then take everything
1098 # TODO bind to format param
1099 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1100 files = { 'hd': [], 'sd': [], 'other': []}
# Bucket each available codec by its best advertised quality.
1101 for codec_name, codec_extension in codecs:
1102 if codec_name in config["video"]["files"]:
1103 if 'hd' in config["video"]["files"][codec_name]:
1104 files['hd'].append((codec_name, codec_extension, 'hd'))
1105 elif 'sd' in config["video"]["files"][codec_name]:
1106 files['sd'].append((codec_name, codec_extension, 'sd'))
1108 files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))
# Pick the best non-empty bucket, preferring hd, then sd, then other.
1110 for quality in ('hd', 'sd', 'other'):
1111 if len(files[quality]) > 0:
1112 video_quality = files[quality][0][2]
1113 video_codec = files[quality][0][0]
1114 video_extension = files[quality][0][1]
1115 self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
# NOTE(review): the else-branch header (no codec found) is elided; 1118 is its body.
1118 self._downloader.trouble(u'ERROR: no known codec found')
1121 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1122 %(video_id, sig, timestamp, video_quality, video_codec.upper())
# NOTE(review): the 'return [{' opener of the result list is elided before these entries.
1127 'uploader': video_uploader,
1128 'uploader_id': video_uploader_id,
1129 'upload_date': video_upload_date,
1130 'title': video_title,
1131 'ext': video_extension,
1132 'thumbnail': video_thumbnail,
1133 'description': video_description,
1137 class ArteTvIE(InfoExtractor):
1138 """arte.tv information extractor."""
1140 _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
# Live pages are recognized by an index-N.html suffix on the last path segment.
1141 _LIVE_URL = r'index-[0-9]+\.html$'
1143 IE_NAME = u'arte.tv'
1145 def __init__(self, downloader=None):
1146 InfoExtractor.__init__(self, downloader)
1148 def report_download_webpage(self, video_id):
1149 """Report webpage download."""
1150 self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)
1152 def report_extraction(self, video_id):
1153 """Report information extraction."""
1154 self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)
1156 def fetch_webpage(self, url):
# Download url and return the raw page body; failures are routed through
# the downloader's trouble() channel.
1157 request = compat_urllib_request.Request(url)
# NOTE(review): a try: header is elided here; 1161/1164 are the matching excepts.
1159 self.report_download_webpage(url)
1160 webpage = compat_urllib_request.urlopen(request).read()
1161 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1162 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1164 except ValueError as err:
1165 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1169 def grep_webpage(self, url, regex, regexFlags, matchTuples):
# Fetch url, apply regex with regexFlags, and collect the selected match
# groups into a dict. matchTuples is a list of
# (group_index, key, error_message) triples.
1170 page = self.fetch_webpage(url)
1171 mobj = re.search(regex, page, regexFlags)
# NOTE(review): the no-match guard header is elided; 1175 is its error branch.
1175 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1178 for (i, key, err) in matchTuples:
1179 if mobj.group(i) is None:
1180 self._downloader.trouble(err)
1183 info[key] = mobj.group(i)
1187 def extractLiveStream(self, url):
# Live streams: the language code sits four path segments from the end.
1188 video_lang = url.split('/')[-4]
# First hop: locate the videothek JS file referenced by the page.
1189 info = self.grep_webpage(
1191 r'src="(.*?/videothek_js.*?\.js)',
1194 (1, 'url', u'ERROR: Invalid URL: %s' % url)
1197 http_host = url.split('/')[2]
1198 next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
# Second hop: pull stream path, SWF player and base url out of the JS.
1199 info = self.grep_webpage(
1201 r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
1202 '(http://.*?\.swf).*?' +
1206 (1, 'path', u'ERROR: could not extract video path: %s' % url),
1207 (2, 'player', u'ERROR: could not extract video player: %s' % url),
1208 (3, 'url', u'ERROR: could not extract video url: %s' % url)
1211 video_url = u'%s/%s' % (info.get('url'), info.get('path'))
1213 def extractPlus7Stream(self, url):
# "+7" catch-up streams: follow two levels of XML indirection to the
# final metadata document.
1214 video_lang = url.split('/')[-3]
1215 info = self.grep_webpage(
1217 r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
1220 (1, 'url', u'ERROR: Invalid URL: %s' % url)
1223 next_url = compat_urllib_parse.unquote(info.get('url'))
# Pick the <video> reference matching the requested language.
1224 info = self.grep_webpage(
1226 r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
1229 (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
1232 next_url = compat_urllib_parse.unquote(info.get('url'))
# Final document: id, title, date and the hd-quality media url.
1234 info = self.grep_webpage(
1236 r'<video id="(.*?)".*?>.*?' +
1237 '<name>(.*?)</name>.*?' +
1238 '<dateVideo>(.*?)</dateVideo>.*?' +
1239 '<url quality="hd">(.*?)</url>',
1242 (1, 'id', u'ERROR: could not extract video id: %s' % url),
1243 (2, 'title', u'ERROR: could not extract video title: %s' % url),
1244 (3, 'date', u'ERROR: could not extract video date: %s' % url),
1245 (4, 'url', u'ERROR: could not extract video url: %s' % url)
# NOTE(review): the return/dict opener for this info dict is elided above.
1250 'id': info.get('id'),
1251 'url': compat_urllib_parse.unquote(info.get('url')),
1252 'uploader': u'arte.tv',
1253 'upload_date': info.get('date'),
1254 'title': info.get('title').decode('utf-8'),
1260 def _real_extract(self, url):
1261 video_id = url.split('/')[-1]
1262 self.report_extraction(video_id)
# Dispatch on URL shape: live stream pages vs. regular "+7" video pages.
1264 if re.search(self._LIVE_URL, video_id) is not None:
1265 self.extractLiveStream(url)
1268 info = self.extractPlus7Stream(url)
1273 class GenericIE(InfoExtractor):
1274 """Generic last-resort information extractor."""
1277 IE_NAME = u'generic'
1279 def __init__(self, downloader=None):
1280 InfoExtractor.__init__(self, downloader)
1282 def report_download_webpage(self, video_id):
1283 """Report webpage download."""
# The fallback warning is suppressed in test mode.
1284 if not self._downloader.params.get('test', False):
1285 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1286 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1288 def report_extraction(self, video_id):
1289 """Report information extraction."""
1290 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1292 def report_following_redirect(self, new_url):
1293 """Report information extraction."""
1294 self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1296 def _test_redirect(self, url):
1297 """Check if it is a redirect, like url shorteners, in case restart chain."""
# Request subclass that issues HEAD instead of GET.
1298 class HeadRequest(compat_urllib_request.Request):
1299 def get_method(self):
# NOTE(review): the method body (presumably returning "HEAD") is elided here.
1302 class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
1304 Subclass the HTTPRedirectHandler to make it use our
1305 HeadRequest also on the redirected URL
1307 def redirect_request(self, req, fp, code, msg, headers, newurl):
1308 if code in (301, 302, 303, 307):
1309 newurl = newurl.replace(' ', '%20')
# Drop body-related headers: a HEAD request carries no payload.
1310 newheaders = dict((k,v) for k,v in req.headers.items()
1311 if k.lower() not in ("content-length", "content-type"))
1312 return HeadRequest(newurl,
1314 origin_req_host=req.get_origin_req_host(),
# Non-redirect codes are re-raised as HTTP errors.
1317 raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)
1319 class HTTPMethodFallback(compat_urllib_request.BaseHandler):
1321 Fallback to GET if HEAD is not allowed (405 HTTP error)
1323 def http_error_405(self, req, fp, code, msg, headers):
1327 newheaders = dict((k,v) for k,v in req.headers.items()
1328 if k.lower() not in ("content-length", "content-type"))
1329 return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
1331 origin_req_host=req.get_origin_req_host(),
# Build a minimal opener with just the handlers needed for the HEAD probe.
1335 opener = compat_urllib_request.OpenerDirector()
1336 for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
1337 HTTPMethodFallback, HEADRedirectHandler,
1338 compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
1339 opener.add_handler(handler())
1341 response = opener.open(HeadRequest(url))
1342 new_url = response.geturl()
# NOTE(review): the comparison of new_url against url (the "no redirect"
# early-return path) is elided before this point.
1347 self.report_following_redirect(new_url)
1348 self._downloader.download([new_url])
1351 def _real_extract(self, url):
1352 if self._test_redirect(url): return
1354 video_id = url.split('/')[-1]
# NOTE(review): a try: header is elided here; 1357 is the matching except.
1356 webpage = self._download_webpage(url, video_id)
1357 except ValueError as err:
1358 # since this is the last-resort InfoExtractor, if
1359 # this error is thrown, it'll be thrown here
1360 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1363 self.report_extraction(video_id)
1364 # Start with something easy: JW Player in SWFObject
1365 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
# NOTE(review): "if mobj is None:" guards between these progressively
# broader searches are elided in this chunk.
1367 # Broaden the search a little bit
1368 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1370 # Broaden the search a little bit: JWPlayer JS loader
1371 mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
1373 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1376 # It's possible that one of the regexes
1377 # matched, but returned an empty group:
1378 if mobj.group(1) is None:
1379 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1382 video_url = compat_urllib_parse.unquote(mobj.group(1))
1383 video_id = os.path.basename(video_url)
1385 # here's a fun little line of code for you:
1386 video_extension = os.path.splitext(video_id)[1][1:]
1387 video_id = os.path.splitext(video_id)[0]
1389 # it's tempting to parse this further, but you would
1390 # have to take into account all the variations like
1391 # Video Title - Site Name
1392 # Site Name | Video Title
1393 # Video Title - Tagline | Site Name
1394 # and so on and so forth; it's just not practical
1395 mobj = re.search(r'<title>(.*)</title>', webpage)
1397 self._downloader.trouble(u'ERROR: unable to extract title')
1399 video_title = mobj.group(1)
1401 # video uploader is domain name
1402 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1404 self._downloader.trouble(u'ERROR: unable to extract title')
1406 video_uploader = mobj.group(1)
# NOTE(review): the 'return [{' opener of the result list is elided before these entries.
1411 'uploader': video_uploader,
1412 'upload_date': None,
1413 'title': video_title,
1414 'ext': video_extension,
1418 class YoutubeSearchIE(InfoExtractor):
1419 """Information Extractor for YouTube search queries."""
1420 _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
1421 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1422 _max_youtube_results = 1000
1423 IE_NAME = u'youtube:search'
1425 def __init__(self, downloader=None):
1426 InfoExtractor.__init__(self, downloader)
1428 def report_download_page(self, query, pagenum):
1429 """Report attempt to download search page with given number."""
1430 query = query.decode(preferredencoding())
1431 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
1433 def _real_extract(self, query):
1434 mobj = re.match(self._VALID_URL, query)
# NOTE(review): the no-match guard header is elided; 1436 is its error branch.
1436 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1439 prefix, query = query.split(':')
1441 query = query.encode('utf-8')
# Prefix selects result count: '' -> 1 result, 'all' -> the maximum,
# otherwise the prefix is parsed as an integer below.
1443 self._download_n_results(query, 1)
1445 elif prefix == 'all':
1446 self._download_n_results(query, self._max_youtube_results)
1452 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1454 elif n > self._max_youtube_results:
1455 self._downloader.report_warning(u'ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1456 n = self._max_youtube_results
1457 self._download_n_results(query, n)
1459 except ValueError: # parsing prefix as integer fails
1460 self._download_n_results(query, 1)
1463 def _download_n_results(self, query, n):
1464 """Downloads a specified number of results for a query"""
# Page through the GData API 50 results at a time until the limit is reached.
1470 while (50 * pagenum) < limit:
1471 self.report_download_page(query, pagenum+1)
1472 result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
1473 request = compat_urllib_request.Request(result_url)
# NOTE(review): a try: header is elided here; 1476 is the matching except.
1475 data = compat_urllib_request.urlopen(request).read().decode('utf-8')
1476 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1477 self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err))
1479 api_response = json.loads(data)['data']
1481 if not 'items' in api_response:
1482 self._downloader.trouble(u'[youtube] No video results')
1485 new_ids = list(video['id'] for video in api_response['items'])
1486 video_ids += new_ids
# Never request more results than the API reports as available.
1488 limit = min(n, api_response['totalItems'])
1491 if len(video_ids) > n:
1492 video_ids = video_ids[:n]
1493 for id in video_ids:
1494 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1498 class GoogleSearchIE(InfoExtractor):
1499 """Information Extractor for Google Video search queries."""
1500 _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
1501 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1502 _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
1503 _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
1504 _max_google_results = 1000
1505 IE_NAME = u'video.google:search'
1507 def __init__(self, downloader=None):
1508 InfoExtractor.__init__(self, downloader)
1510 def report_download_page(self, query, pagenum):
1511 """Report attempt to download playlist page with given number."""
1512 query = query.decode(preferredencoding())
1513 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
1515 def _real_extract(self, query):
1516 mobj = re.match(self._VALID_URL, query)
# NOTE(review): the no-match guard header is elided; 1518 is its error branch.
1518 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1521 prefix, query = query.split(':')
1523 query = query.encode('utf-8')
# Prefix selects result count: '' -> 1, 'all' -> maximum, else parsed as int.
1525 self._download_n_results(query, 1)
1527 elif prefix == 'all':
1528 self._download_n_results(query, self._max_google_results)
1534 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1536 elif n > self._max_google_results:
1537 self._downloader.report_warning(u'gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1538 n = self._max_google_results
1539 self._download_n_results(query, n)
1541 except ValueError: # parsing prefix as integer fails
1542 self._download_n_results(query, 1)
1545 def _download_n_results(self, query, n):
1546 """Downloads a specified number of results for a query"""
1552 self.report_download_page(query, pagenum)
1553 result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
1554 request = compat_urllib_request.Request(result_url)
# NOTE(review): a try: header is elided here; 1557 is the matching except.
1556 page = compat_urllib_request.urlopen(request).read()
1557 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1558 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1561 # Extract video identifiers
1562 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1563 video_id = mobj.group(1)
1564 if video_id not in video_ids:
1565 video_ids.append(video_id)
1566 if len(video_ids) == n:
1567 # Specified n videos reached
1568 for id in video_ids:
1569 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
# Stop paging when the results page has no "next" link.
1572 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1573 for id in video_ids:
1574 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1577 pagenum = pagenum + 1
1580 class YahooSearchIE(InfoExtractor):
1581 """Information Extractor for Yahoo! Video search queries."""
1584 _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
1585 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1586 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1587 _MORE_PAGES_INDICATOR = r'\s*Next'
1588 _max_yahoo_results = 1000
1589 IE_NAME = u'video.yahoo:search'
1591 def __init__(self, downloader=None):
1592 InfoExtractor.__init__(self, downloader)
1594 def report_download_page(self, query, pagenum):
1595 """Report attempt to download playlist page with given number."""
1596 query = query.decode(preferredencoding())
1597 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
1599 def _real_extract(self, query):
1600 mobj = re.match(self._VALID_URL, query)
# NOTE(review): the no-match guard header is elided; 1602 is its error branch.
1602 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1605 prefix, query = query.split(':')
1607 query = query.encode('utf-8')
# Prefix selects result count: '' -> 1, 'all' -> maximum, else parsed as int.
1609 self._download_n_results(query, 1)
1611 elif prefix == 'all':
1612 self._download_n_results(query, self._max_yahoo_results)
1618 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1620 elif n > self._max_yahoo_results:
1621 self._downloader.report_warning(u'yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1622 n = self._max_yahoo_results
1623 self._download_n_results(query, n)
1625 except ValueError: # parsing prefix as integer fails
1626 self._download_n_results(query, 1)
1629 def _download_n_results(self, query, n):
1630 """Downloads a specified number of results for a query"""
# already_seen guards against duplicate ids across result pages.
1633 already_seen = set()
1637 self.report_download_page(query, pagenum)
1638 result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
1639 request = compat_urllib_request.Request(result_url)
# NOTE(review): a try: header is elided here; 1642 is the matching except.
1641 page = compat_urllib_request.urlopen(request).read()
1642 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1643 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1646 # Extract video identifiers
1647 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1648 video_id = mobj.group(1)
1649 if video_id not in already_seen:
1650 video_ids.append(video_id)
1651 already_seen.add(video_id)
1652 if len(video_ids) == n:
1653 # Specified n videos reached
1654 for id in video_ids:
1655 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
# Stop paging when the results page has no "Next" link.
1658 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
1659 for id in video_ids:
1660 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1663 pagenum = pagenum + 1
1666 class YoutubePlaylistIE(InfoExtractor):
1667 """Information Extractor for YouTube playlists."""
# _VALID_URL is a verbose-mode pattern; see suitable() below for the
# re.VERBOSE flag. NOTE(review): several pattern lines are elided in this chunk.
1669 _VALID_URL = r"""(?:
1674 (?:course|view_play_list|my_playlists|artist|playlist|watch)
1675 \? (?:.*?&)*? (?:p|a|list)=
1680 ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
1683 ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
1685 _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
1687 IE_NAME = u'youtube:playlist'
1689 def __init__(self, downloader=None):
1690 InfoExtractor.__init__(self, downloader)
# Override: the base suitable() does not pass re.VERBOSE, which this
# class's _VALID_URL requires.
1693 def suitable(cls, url):
1694 """Receives a URL and returns True if suitable for this IE."""
1695 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
1697 def report_download_page(self, playlist_id, pagenum):
1698 """Report attempt to download playlist page with given number."""
1699 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1701 def _real_extract(self, url):
1702 # Extract playlist id
1703 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
# NOTE(review): the no-match guard header is elided; 1705 is its error branch.
1705 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1708 # Download playlist videos from API
1709 playlist_id = mobj.group(1) or mobj.group(2)
1714 self.report_download_page(playlist_id, page_num)
1716 url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
# NOTE(review): a try: header is elided here; 1719 is the matching except.
1718 page = compat_urllib_request.urlopen(url).read().decode('utf8')
1719 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1720 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
# NOTE(review): another try: header is elided here; 1725 is the matching except.
1724 response = json.loads(page)
1725 except ValueError as err:
1726 self._downloader.trouble(u'ERROR: Invalid JSON in API response: ' + compat_str(err))
1729 if not 'feed' in response or not 'entry' in response['feed']:
1730 self._downloader.trouble(u'ERROR: Got a malformed response from YouTube API')
# Collect (position, url) pairs; entries without 'content' are skipped.
1732 videos += [ (entry['yt$position']['$t'], entry['content']['src'])
1733 for entry in response['feed']['entry']
1734 if 'content' in entry ]
# A short page means the end of the playlist has been reached.
1736 if len(response['feed']['entry']) < self._MAX_RESULTS:
# Order by playlist position, then keep only the urls.
1740 videos = [v[1] for v in sorted(videos)]
# Apply the user's --playlist-start/--playlist-end window (1-based).
1743 playliststart = self._downloader.params.get('playliststart', 1) - 1
1744 playlistend = self._downloader.params.get('playlistend', -1)
1745 if playlistend == -1:
1746 videos = videos[playliststart:]
1748 videos = videos[playliststart:playlistend]
1750 if len(videos) == total:
1751 self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total))
1753 self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(videos)))
1755 for video in videos:
1756 self._downloader.download([video])
1760 class YoutubeChannelIE(InfoExtractor):
1761 """Information Extractor for YouTube channels."""
1763 _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
1764 _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
# The literal "Next »" marker on a channel page signals more pages exist.
1765 _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
1766 IE_NAME = u'youtube:channel'
1768 def report_download_page(self, channel_id, pagenum):
1769 """Report attempt to download channel page with given number."""
1770 self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))
1772 def _real_extract(self, url):
1773 # Extract channel id
1774 mobj = re.match(self._VALID_URL, url)
# NOTE(review): the no-match guard header is elided; 1776 is its error branch.
1776 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1779 # Download channel pages
1780 channel_id = mobj.group(1)
1785 self.report_download_page(channel_id, pagenum)
1786 url = self._TEMPLATE_URL % (channel_id, pagenum)
1787 request = compat_urllib_request.Request(url)
# NOTE(review): a try: header is elided here; 1790 is the matching except.
1789 page = compat_urllib_request.urlopen(request).read().decode('utf8')
1790 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1791 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1794 # Extract video identifiers
1796 for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
1797 if mobj.group(1) not in ids_in_page:
1798 ids_in_page.append(mobj.group(1))
1799 video_ids.extend(ids_in_page)
# Stop paging once the "Next »" marker disappears from the page.
1801 if self._MORE_PAGES_INDICATOR not in page:
1803 pagenum = pagenum + 1
1805 self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))
1807 for id in video_ids:
1808 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1812 class YoutubeUserIE(InfoExtractor):
1813 """Information Extractor for YouTube users."""
1815 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1816 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
# The GData uploads feed returns at most 50 entries per request.
1817 _GDATA_PAGE_SIZE = 50
1818 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
1819 _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
1820 IE_NAME = u'youtube:user'
1822 def __init__(self, downloader=None):
1823 InfoExtractor.__init__(self, downloader)
1825 def report_download_page(self, username, start_index):
1826 """Report attempt to download user page."""
1827 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
1828 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
1830 def _real_extract(self, url):
1832 mobj = re.match(self._VALID_URL, url)
# NOTE(review): the no-match guard header is elided; 1834 is its error branch.
1834 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1837 username = mobj.group(1)
1839 # Download video ids using YouTube Data API. Result size per
1840 # query is limited (currently to 50 videos) so we need to query
1841 # page by page until there are no video ids - it means we got
# The GData feed is 1-indexed, hence the +1.
1848 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1849 self.report_download_page(username, start_index)
1851 request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
# NOTE(review): a try: header is elided here; 1855 is the matching except.
1854 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1855 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1856 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1859 # Extract video identifiers
1862 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1863 if mobj.group(1) not in ids_in_page:
1864 ids_in_page.append(mobj.group(1))
1866 video_ids.extend(ids_in_page)
1868 # A little optimization - if current page is not
1869 # "full", ie. does not contain PAGE_SIZE video ids then
1870 # we can assume that this page is the last one - there
1871 # are no more ids on further pages - no need to query
1874 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
# Apply the user's --playlist-start/--playlist-end window (1-based).
1879 all_ids_count = len(video_ids)
1880 playliststart = self._downloader.params.get('playliststart', 1) - 1
1881 playlistend = self._downloader.params.get('playlistend', -1)
1883 if playlistend == -1:
1884 video_ids = video_ids[playliststart:]
1886 video_ids = video_ids[playliststart:playlistend]
1888 self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
1889 (username, all_ids_count, len(video_ids)))
1891 for video_id in video_ids:
1892 self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
1895 class BlipTVUserIE(InfoExtractor):
1896 """Information Extractor for blip.tv users."""
1898 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
1900 IE_NAME = u'blip.tv:user'
1902 def __init__(self, downloader=None):
1903 InfoExtractor.__init__(self, downloader)
1905 def report_download_page(self, username, pagenum):
1906 """Report attempt to download user page."""
1907 self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
1908 (self.IE_NAME, username, pagenum))
1910 def _real_extract(self, url):
1912 mobj = re.match(self._VALID_URL, url)
# NOTE(review): the no-match guard header is elided; 1914 is its error branch.
1914 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1917 username = mobj.group(1)
1919 page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
1921 request = compat_urllib_request.Request(url)
# NOTE(review): a try: header is elided here; 1927 is the matching except.
1924 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
# Resolve the user's numeric id, required by the episode-list endpoint.
1925 mobj = re.search(r'data-users-id="([^"]+)"', page)
1926 page_base = page_base % mobj.group(1)
1927 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1928 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1932 # Download video ids using BlipTV Ajax calls. Result size per
1933 # query is limited (currently to 12 videos) so we need to query
1934 # page by page until there are no video ids - it means we got
1941 self.report_download_page(username, pagenum)
1942 url = page_base + "&page=" + str(pagenum)
1943 request = compat_urllib_request.Request( url )
# NOTE(review): a try: header is elided here; 1946 is the matching except.
1945 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1946 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1947 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1950 # Extract video identifiers
1953 for mobj in re.finditer(r'href="/([^"]+)"', page):
1954 if mobj.group(1) not in ids_in_page:
1955 ids_in_page.append(unescapeHTML(mobj.group(1)))
1957 video_ids.extend(ids_in_page)
1959 # A little optimization - if current page is not
1960 # "full", ie. does not contain PAGE_SIZE video ids then
1961 # we can assume that this page is the last one - there
1962 # are no more ids on further pages - no need to query
1965 if len(ids_in_page) < self._PAGE_SIZE:
# Apply the user's --playlist-start/--playlist-end window (1-based).
1970 all_ids_count = len(video_ids)
1971 playliststart = self._downloader.params.get('playliststart', 1) - 1
1972 playlistend = self._downloader.params.get('playlistend', -1)
1974 if playlistend == -1:
1975 video_ids = video_ids[playliststart:]
1977 video_ids = video_ids[playliststart:playlistend]
1979 self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
1980 (self.IE_NAME, username, all_ids_count, len(video_ids)))
1982 for video_id in video_ids:
1983 self._downloader.download([u'http://blip.tv/'+video_id])
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    # '(?#locale)' is a regex comment; '../' matches a two-char locale path.
    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

    def _real_extract(self, url):
        """Resolve a depositfiles link to the real file URL, title and extension."""
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        self.report_download_webpage(file_id)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % compat_str(err))

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                # Collapse whitespace in the site's own error message.
                restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
                self._downloader.trouble(u'ERROR: %s' % restriction_message)
                self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)

        file_url = mobj.group(1)
        # Extension without the leading dot.
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        self._downloader.trouble(u'ERROR: unable to extract title')
        file_title = mobj.group(1).decode('utf-8')

        # NOTE(review): the .decode() calls assume Python 2 byte strings.
        'id': file_id.decode('utf-8'),
        'url': file_url.decode('utf-8'),
        'upload_date': None,
        'title': file_title,
        'ext': file_extension.decode('utf-8'),
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    # Machine name looked up in ~/.netrc when --netrc is used.
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[%s] Logging in' % self.IE_NAME)

    def _real_initialize(self):
        """Optionally log in before extraction.

        Credentials come from --username/--password or, with --netrc, from
        the 'facebook' entry in ~/.netrc.  Login failures only warn;
        extraction proceeds unauthenticated.
        """
        if self._downloader is None:

        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            info = netrc.netrc().authenticators(self._NETRC_MACHINE)
            if info is not None:
                raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))

        if useremail is None:

        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        login_results = compat_urllib_request.urlopen(request).read()
        # A login <form> still present in the response means login failed.
        if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
            self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))

    def _real_extract(self, url):
        """Extract the video URL, title and duration from a Facebook video page."""
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group('ID')

        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The flashvars JSON is wedged between two fixed script snippets;
        # match the payload between the escaped anchors.
        BEFORE = '[["allowFullScreen","true"],["allowScriptAccess","always"],["salign","tl"],["scale","noscale"],["wmode","opaque"]].forEach(function(param) {swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        # Prefer the HD stream, falling back to SD.
        video_url = params['hd_src']
        video_url = params['sd_src']
        raise ExtractorError(u'Cannot find video URL')
        video_duration = int(params['video_duration'])

        m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
        raise ExtractorError(u'Cannot find title in webpage')
        video_title = unescapeHTML(m.group(1))

        'title': video_title,
        'duration': video_duration,
        'thumbnail': params['thumbnail_src'],
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    # Captures the lowercase filename extension from a media URL.
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        """Extract video metadata from blip.tv's JSON API (or detect a
        direct video download)."""
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # /play/ URLs redirect to a page whose URL fragment carries the
        # real file id; rewrite to the canonical /a/a-<id> URL and recurse.
        urlp = compat_urllib_parse_urlparse(url)
        if urlp.path.startswith('/play/'):
            request = compat_urllib_request.Request(url)
            response = compat_urllib_request.urlopen(request)
            redirecturl = response.geturl()
            rurlp = compat_urllib_parse_urlparse(redirecturl)
            file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
            url = 'http://blip.tv/a/a-' + file_id
            return self._real_extract(url)

        # Request the JSON description of the video (cchar is '?' or '&').
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        # blip.tv serves different data depending on the User-Agent.
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        urlh = compat_urllib_request.urlopen(request)
        if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
            basename = url.split('/')[-1]
            title,ext = os.path.splitext(basename)
            # NOTE(review): .decode() assumes a Python 2 byte string here.
            title = title.decode('UTF-8')
            ext = ext.replace('.', '')
            self.report_direct_download(title)
            'upload_date': None,
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
            json_code_bytes = urlh.read()
            json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % compat_str(err))

            json_data = json.loads(json_code)
            # Some API responses wrap the payload in a 'Post' key.
            if 'Post' in json_data:
                data = json_data['Post']

            # Example datestamp: '11-07-12 04:25PM' -> YYYYMMDD.
            upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
            video_url = data['media']['url']
            umobj = re.match(self._URL_EXT, video_url)
            raise ValueError('Can not determine filename extension')
            ext = umobj.group(1)

            'id': data['item_id'],
            'uploader': data['display_name'],
            'upload_date': upload_date,
            'title': data['title'],
            'format': data['media']['mimeType'],
            'thumbnail': data['thumbnailUrl'],
            'description': data['description'],
            'player_url': data['embedUrl'],
            # Must match the header used for the JSON request above.
            'user_agent': 'iTunes/10.6.1',
            except (ValueError,KeyError) as err:
                self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def __init__(self, downloader=None):
        # No extra state; defers entirely to the base class.
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
2258 def _real_extract(self,url):
2259 mobj = re.match(self._VALID_URL, url)
2261 self._download.trouble(u'ERROR: invalid URL: %s' % url)
2264 video_id = mobj.group(1)
2267 webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
2268 webpage = self._download_webpage(webpage_url, video_id)
2270 self.report_extraction(video_id)
2271 mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\' />',
2274 self._downloader.trouble(u'ERROR: unable to extract media URL')
2276 video_url = mobj.group(1) + ('/%s.flv' % video_id)
2278 mobj = re.search('<title>([^<]+)</title>', webpage)
2280 self._downloader.trouble(u'ERROR: unable to extract title')
2283 video_title = mobj.group(1)
2289 'upload_date': None,
2290 'title': video_title,
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                     |(https?://)?(www\.)?
                         (?P<showname>thedailyshow|colbertnation)\.com/
                        (full-episodes/(?P<episode>.*)|
                          (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                          |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                    """

    # Known bitrates, lowest to highest; the last entry is preferred below.
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    _video_extensions = {
    _video_dimensions = {

    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL above is written with re.VERBOSE.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def report_extraction(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

    def report_config_download(self, episode_id, media_id):
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration for %s' % (episode_id, media_id))

    def report_index_download(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

    def _print_formats(self, formats):
        """Print the available format ids with extension and dimensions."""
        print('Available formats:')
        print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))

    def _real_extract(self, url):
        """Resolve a show/episode/clip URL to one info dict per video part."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # Shortnames like :tds redirect to the newest full episode.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
                epTitle = mobj.group('cntitle')

        dlNewest = not mobj.group('episode')
        epTitle = mobj.group('showname')
        epTitle = mobj.group('episode')

        req = compat_urllib_request.Request(url)
        self.report_extraction(epTitle)
        htmlHandle = compat_urllib_request.urlopen(req)
        html = htmlHandle.read()
        webpage = html.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))

        # The landing page may redirect to the concrete episode URL.
        url = htmlHandle.geturl()
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)

        if mobj.group('episode') == '':
            self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
        epTitle = mobj.group('episode')

        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
            mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        indexXml = compat_urllib_request.urlopen(indexUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download episode index: ' + compat_str(err))

        # One <item> per video part of the episode.
        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for partNum,itemEl in enumerate(itemEls):
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configReq = compat_urllib_request.Request(configUrl)
            self.report_config_download(epTitle, shortMediaId)
            configXml = compat_urllib_request.urlopen(configReq).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))

            cdoc = xml.etree.ElementTree.fromstring(configXml)
            # Collect (bitrate, rtmp-url) pairs for this part.
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)

            self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])

            # For now, just pick the highest bitrate
            format,rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            format, rtmp_video_url = f, v

            # Rewrite the rtmp URL to a direct HTTP mp4 mirror.
            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
            'upload_date': officialDate,
            'description': officialTitle,
            results.append(info)
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_extraction(self, showName):
        self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)

    def report_config_download(self, showName):
        self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)

    def _real_extract(self, url):
        """Follow the page's og:video player to its JSON(ish) config and
        pull the real stream URL plus metadata."""
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        webPage = compat_urllib_request.urlopen(url)
        webPageBytes = webPage.read()
        # Honor the charset from the Content-Type header, default utf-8.
        m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
        webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: ' + compat_str(err))

        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
        description = unescapeHTML(descMatch.group(1))
        imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
        imgUrl = unescapeHTML(imgMatch.group(1))
        playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
        playerUrl = unescapeHTML(playerUrlMatch.group(1))
        # The player URL carries a percent-encoded 'config=' parameter.
        configUrlMatch = re.search('config=(.*)$', playerUrl)
        configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))

        self.report_config_download(showName)
        configJSON = compat_urllib_request.urlopen(configUrl)
        m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
        configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download configuration: ' + compat_str(err))

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

        config = json.loads(configJSON)
        except (ValueError,) as err:
            self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + compat_str(err))

        playlist = config['playlist']
        # Index 1 is the actual episode; index 0 is presumably an intro.
        videoUrl = playlist[1]['url']

        'uploader': showName,
        'upload_date': None,
        'thumbnail': imgUrl,
        'description': description,
        'player_url': playerUrl,
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Resolve a collegehumor video via its metadata XML and f4m manifest."""
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        'upload_date': None,

        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        videoNode = mdoc.findall('./video')[0]
        info['description'] = videoNode.findall('./description')[0].text
        info['title'] = videoNode.findall('./caption')[0].text
        info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
        manifest_url = videoNode.findall('./file')[0].text
        self._downloader.trouble(u'\nERROR: Invalid metadata XML file')

        # hdcore parameter is required by Adobe HTTP Dynamic Streaming.
        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))

        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        # f4m manifest elements live in the Adobe namespace.
        media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
        node_id = media_node.attrib['url']
        video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError as err:
            self._downloader.trouble(u'\nERROR: Invalid manifest file')

        # Rebuild the direct segment URL from the manifest location.
        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Scrape the flv URL, title and thumbnail from the watch page."""
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group(1)

        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        # The percent-encoded media URL is a flashvars parameter.
        mobj = re.search(r'flv_url=(.+?)&', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video url')
        video_url = compat_urllib_parse.unquote(mobj.group(1))

        mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = mobj.group(1)

        # Extract video thumbnail
        mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
        # group(0) is the whole matched thumbnail URL.
        video_thumbnail = mobj.group(0)

        'upload_date': None,
        'title': video_title,
        'thumbnail': video_thumbnail,
        'description': None,
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        # No extra state; defers entirely to the base class.
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Resolve a track page to its mp3 stream via the SoundCloud API."""
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract simple title (uploader + slug of song title)
        slug_title = mobj.group(2)
        simple_title = uploader + u'-' + slug_title

        self.report_resolve('%s/%s' % (uploader, slug_title))

        # The resolve endpoint maps a page URL to the track's API record.
        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(resolv_url)
        info_json_bytes = compat_urllib_request.urlopen(request).read()
        info_json = info_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction('%s/%s' % (uploader, slug_title))

        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(streams_url)
        stream_json_bytes = compat_urllib_request.urlopen(request).read()
        stream_json = stream_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download stream definitions: %s' % compat_str(err))

        streams = json.loads(stream_json)
        # 128 kbps mp3 stream is the one this extractor downloads.
        mediaURL = streams['http_mp3_128_url']

        'uploader': info['user']['username'],
        'upload_date': info['created_at'],
        'title': info['title'],
        'description': info['description'],
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Decode the base64 video reference on the page into an rtmpe URL."""
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # The real media id is base64- and percent-encoded in jsclassref.
        mobj = re.search(r"jsclassref='([^']*)'", webpage)
        self._downloader.trouble(u'ERROR: unable to extract video url')
        real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        mobj = re.search(r'contentTitle = "(.*?)";', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = mobj.group(1)

        # Extract description
        video_description = u'No description available.'
        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if mobj is not None:
            video_description = mobj.group(1)

        # Derive id and extension from the media file name.
        video_filename = video_url.split('/')[-1]
        video_id, extension = video_filename.split('.')

        'upload_date': None,
        'title': video_title,
        'ext': extension, # Extension is always(?) mp4, but seems to be flv
        'description': video_description,
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def __init__(self, downloader=None):
        # No extra state; defers entirely to the base class.
        InfoExtractor.__init__(self, downloader)

    def report_download_json(self, file_id):
        """Report JSON download."""
        self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json"""
        bitrate_list = jsonData[fmt]
        if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
            bitrate = max(bitrate_list) # select highest

        url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        for url in url_list:
            compat_urllib_request.urlopen(url)
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:

    def _print_formats(self, formats):
        """Print each format/bitrate pair with its file extension."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                ext = formats[fmt][b][0]
                print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))

    def _real_extract(self, url):
        """Pick a working stream URL for the requested (or best) format.

        NOTE(review): the .decode('utf-8') calls on str values below are
        Python 2 only; consistent with _WORKING = False.
        """
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # extract uploader & filename from url
        uploader = mobj.group(1).decode('utf-8')
        file_id = uploader + "-" + mobj.group(2).decode('utf-8')

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        self.report_download_json(file_url)
        jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err))

        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)

        # No explicit format: take the first format with a reachable URL.
        if req_format is None or req_format == 'best':
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                file_url = self.check_urls(url_list)
                if file_url is not None:

        if req_format not in formats:
            self._downloader.trouble(u'ERROR: format is not available')

        url_list = self.get_urls(formats, req_format)
        file_url = self.check_urls(url_list)
        format_param = req_format

        'id': file_id.decode('utf-8'),
        'url': file_url.decode('utf-8'),
        'uploader': uploader.decode('utf-8'),
        'upload_date': None,
        'title': json_data['name'],
        'ext': file_url.split('.')[-1].decode('utf-8'),
        'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
        'thumbnail': json_data['thumbnail_url'],
        'description': json_data['description'],
        'player_url': player_url.decode('utf-8'),
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""

    # Matches the site root, a course page, or a specific video page.
    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def report_download_webpage(self, objid):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2952 def _real_extract(self, url):
2953 mobj = re.match(self._VALID_URL, url)
2955 raise ExtractorError(u'Invalid URL: %s' % url)
2957 if mobj.group('course') and mobj.group('video'): # A specific video
2958 course = mobj.group('course')
2959 video = mobj.group('video')
2961 'id': course + '_' + video,
2963 'upload_date': None,
2966 self.report_extraction(info['id'])
2967 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
2968 xmlUrl = baseUrl + video + '.xml'
2970 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2971 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2972 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
2974 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2976 info['title'] = mdoc.findall('./title')[0].text
2977 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
2979 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2981 info['ext'] = info['url'].rpartition('.')[2]
2983 elif mobj.group('course'): # A course page
2984 course = mobj.group('course')
2989 'upload_date': None,
2992 coursepage = self._download_webpage(url, info['id'],
2993 note='Downloading course info page',
2994 errnote='Unable to download course info page')
2996 m = re.search('<h1>([^<]+)</h1>', coursepage)
2998 info['title'] = unescapeHTML(m.group(1))
3000 info['title'] = info['id']
3002 m = re.search('<description>([^<]+)</description>', coursepage)
3004 info['description'] = unescapeHTML(m.group(1))
3006 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
3009 'type': 'reference',
3010 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
3014 for entry in info['list']:
3015 assert entry['type'] == 'reference'
3016 results += self.extract(entry['url'])
3020 'id': 'Stanford OpenClassroom',
3023 'upload_date': None,
3026 self.report_download_webpage(info['id'])
3027 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
3029 rootpage = compat_urllib_request.urlopen(rootURL).read()
3030 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3031 self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
3034 info['title'] = info['id']
3036 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
3039 'type': 'reference',
3040 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
3045 for entry in info['list']:
3046 assert entry['type'] == 'reference'
3047 results += self.extract(entry['url'])
3050 class MTVIE(InfoExtractor):
3051 """Information extractor for MTV.com"""
# NOTE(review): elided listing — guards and 'return' lines between the
# numbered rows are not visible; confirm against the full source.
3053 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
3056 def report_extraction(self, video_id):
3057 """Report information extraction."""
3058 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3060 def _real_extract(self, url):
3061 mobj = re.match(self._VALID_URL, url)
3063 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3065 if not mobj.group('proto'):
3066 url = 'http://' + url
3067 video_id = mobj.group('videoid')
3069 webpage = self._download_webpage(url, video_id)
# Song name, performer, playlist URI and content id are all scraped
# from <meta> tags / inline JS on the video page.
3071 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
3073 self._downloader.trouble(u'ERROR: unable to extract song name')
3075 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3076 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
3078 self._downloader.trouble(u'ERROR: unable to extract performer')
3080 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3081 video_title = performer + ' - ' + song_name
3083 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
3085 self._downloader.trouble(u'ERROR: unable to mtvn_uri')
3087 mtvn_uri = mobj.group(1)
3089 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
3091 self._downloader.trouble(u'ERROR: unable to extract content id')
3093 content_id = mobj.group(1)
# mediaGen endpoint returns an XML document listing available renditions.
3095 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
3096 self.report_extraction(video_id)
3097 request = compat_urllib_request.Request(videogen_url)
3099 metadataXml = compat_urllib_request.urlopen(request).read()
3100 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3101 self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err))
3104 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
3105 renditions = mdoc.findall('.//rendition')
3107 # For now, always pick the highest quality.
3108 rendition = renditions[-1]
# Format string is built as "<ext>-<width>x<height>_<bitrate>".
3111 _,_,ext = rendition.attrib['type'].partition('/')
3112 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
3113 video_url = rendition.find('./src').text
3115 self._downloader.trouble('Invalid rendition field.')
3121 'uploader': performer,
3122 'upload_date': None,
3123 'title': video_title,
3131 class YoukuIE(InfoExtractor):
# Information extractor for v.youku.com. Videos are served in numbered
# segments; the real file id for each segment is derived from a seeded
# character shuffle (_get_file_ID_mix_string / _get_file_id).
# NOTE(review): elided listing — some original lines (guards, returns,
# format-selection branches) are not visible here.
3132 _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
3134 def report_download_webpage(self, file_id):
3135 """Report webpage download."""
3136 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, file_id))
3138 def report_extraction(self, file_id):
3139 """Report information extraction."""
3140 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
# Session id: millisecond timestamp concatenated with two random ints.
3143 nowTime = int(time.time() * 1000)
3144 random1 = random.randint(1000,1998)
3145 random2 = random.randint(1000,9999)
3147 return "%d%d%d" %(nowTime,random1,random2)
3149 def _get_file_ID_mix_string(self, seed):
# Deterministic pseudo-random shuffle of the alphabet driven by `seed`
# (linear congruential step: seed = (seed*211 + 30031) % 65536).
3151 source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
3153 for i in range(len(source)):
3154 seed = (seed * 211 + 30031 ) % 65536
3155 index = math.floor(seed / 65536 * len(source) )
3156 mixed.append(source[int(index)])
3157 source.remove(source[int(index)])
3158 #return ''.join(mixed)
3161 def _get_file_id(self, fileId, seed):
# Decode the '*'-separated index string into real characters using the
# seeded mix string above.
3162 mixed = self._get_file_ID_mix_string(seed)
3163 ids = fileId.split('*')
3167 realId.append(mixed[int(ch)])
3168 return ''.join(realId)
3170 def _real_extract(self, url):
3171 mobj = re.match(self._VALID_URL, url)
3173 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3175 video_id = mobj.group('ID')
3177 info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
3179 request = compat_urllib_request.Request(info_url, None, std_headers)
3181 self.report_download_webpage(video_id)
3182 jsondata = compat_urllib_request.urlopen(request).read()
3183 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3184 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
3187 self.report_extraction(video_id)
3189 jsonstr = jsondata.decode('utf-8')
3190 config = json.loads(jsonstr)
3192 video_title = config['data'][0]['title']
3193 seed = config['data'][0]['seed']
# Format selection: 'best' prefers hd2 when available; 'worst' picks the
# low end (branch bodies elided in this listing).
3195 format = self._downloader.params.get('format', None)
3196 supported_format = list(config['data'][0]['streamfileids'].keys())
3198 if format is None or format == 'best':
3199 if 'hd2' in supported_format:
3204 elif format == 'worst':
3212 fileid = config['data'][0]['streamfileids'][format]
3213 keys = [s['k'] for s in config['data'][0]['segs'][format]]
3214 except (UnicodeDecodeError, ValueError, KeyError):
3215 self._downloader.trouble(u'ERROR: unable to extract info section')
3219 sid = self._gen_sid()
3220 fileid = self._get_file_id(fileid, seed)
3222 #column 8,9 of fileid represent the segment number
3223 #fileid[7:9] should be changed
3224 for index, key in enumerate(keys):
# Splice the hex segment index into positions 8-9 of the file id.
3226 temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
3227 download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
3230 'id': '%s_part%02d' % (video_id, index),
3231 'url': download_url,
3233 'upload_date': None,
3234 'title': video_title,
3237 files_info.append(info)
3242 class XNXXIE(InfoExtractor):
3243 """Information extractor for xnxx.com"""
# NOTE(review): elided listing — 'if result is None:'-style guards and
# 'return' lines between the numbered rows are not visible here.
3245 _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
# Page-scraping patterns: flash video URL, page title, thumbnail URL.
3247 VIDEO_URL_RE = r'flv_url=(.*?)&'
3248 VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
3249 VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'
3251 def report_webpage(self, video_id):
3252 """Report information extraction"""
3253 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3255 def report_extraction(self, video_id):
3256 """Report information extraction"""
3257 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3259 def _real_extract(self, url):
3260 mobj = re.match(self._VALID_URL, url)
3262 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3264 video_id = mobj.group(1)
3266 self.report_webpage(video_id)
3268 # Get webpage content
3270 webpage_bytes = compat_urllib_request.urlopen(url).read()
3271 webpage = webpage_bytes.decode('utf-8')
3272 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3273 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
# Video URL is percent-encoded inside the page; unquote it.
3276 result = re.search(self.VIDEO_URL_RE, webpage)
3278 self._downloader.trouble(u'ERROR: unable to extract video url')
3280 video_url = compat_urllib_parse.unquote(result.group(1))
3282 result = re.search(self.VIDEO_TITLE_RE, webpage)
3284 self._downloader.trouble(u'ERROR: unable to extract video title')
3286 video_title = result.group(1)
3288 result = re.search(self.VIDEO_THUMB_RE, webpage)
3290 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3292 video_thumbnail = result.group(1)
3298 'upload_date': None,
3299 'title': video_title,
3301 'thumbnail': video_thumbnail,
3302 'description': None,
3306 class GooglePlusIE(InfoExtractor):
3307 """Information extractor for plus.google.com."""
# NOTE(review): elided listing — guards/returns between numbered rows are
# not visible; confirm against the full source.
3309 _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
3310 IE_NAME = u'plus.google'
3312 def __init__(self, downloader=None):
3313 InfoExtractor.__init__(self, downloader)
3315 def report_extract_entry(self, url):
3316 """Report downloading extry"""
3317 self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)
3319 def report_date(self, upload_date):
3320 """Report downloading extry"""
3321 self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)
3323 def report_uploader(self, uploader):
3324 """Report downloading extry"""
3325 self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)
3327 def report_title(self, video_title):
3328 """Report downloading extry"""
3329 self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)
3331 def report_extract_vid_page(self, video_page):
3332 """Report information extraction."""
3333 self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)
3335 def _real_extract(self, url):
3336 # Extract id from URL
3337 mobj = re.match(self._VALID_URL, url)
3339 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
3342 post_url = mobj.group(0)
3343 video_id = mobj.group(1)
3345 video_extension = 'flv'
3347 # Step 1, Retrieve post webpage to extract further information
3348 self.report_extract_entry(post_url)
3349 request = compat_urllib_request.Request(post_url)
3351 webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3352 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3353 self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))
3356 # Extract update date
3358 pattern = 'title="Timestamp">(.*?)</a>'
3359 mobj = re.search(pattern, webpage)
3361 upload_date = mobj.group(1)
3362 # Convert timestring to a format suitable for filename
# Reformats the scraped "YYYY-MM-DD" date to the YYYYMMDD form used by
# upload_date fields.
3363 upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3364 upload_date = upload_date.strftime('%Y%m%d')
3365 self.report_date(upload_date)
3369 pattern = r'rel\="author".*?>(.*?)</a>'
3370 mobj = re.search(pattern, webpage)
3372 uploader = mobj.group(1)
3373 self.report_uploader(uploader)
3376 # Get the first line for title
3378 pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3379 mobj = re.search(pattern, webpage)
3381 video_title = mobj.group(1)
3382 self.report_title(video_title)
3384 # Step 2, Stimulate clicking the image box to launch video
3385 pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3386 mobj = re.search(pattern, webpage)
3388 self._downloader.trouble(u'ERROR: unable to extract video page URL')
3390 video_page = mobj.group(1)
3391 request = compat_urllib_request.Request(video_page)
3393 webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3394 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3395 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
3397 self.report_extract_vid_page(video_page)
3400 # Extract video links on video page
3401 """Extract video links of all sizes"""
3402 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3403 mobj = re.findall(pattern, webpage)
3405 self._downloader.trouble(u'ERROR: unable to extract video links')
3407 # Sort in resolution
# findall yields (resolution, url) tuples; sorting puts the largest
# resolution last, and [-1] of that tuple is the URL itself.
3408 links = sorted(mobj)
3410 # Choose the lowest of the sort, i.e. highest resolution
3411 video_url = links[-1]
3412 # Only get the url. The resolution part in the tuple has no use anymore
3413 video_url = video_url[-1]
3414 # Treat escaped \u0026 style hex
# Python 2 strings have .decode; on Python 3 the AttributeError path
# round-trips through bytes to apply unicode-escape.
3416 video_url = video_url.decode("unicode_escape")
3417 except AttributeError: # Python 3
3418 video_url = bytes(video_url, 'ascii').decode('unicode-escape')
3424 'uploader': uploader,
3425 'upload_date': upload_date,
3426 'title': video_title,
3427 'ext': video_extension,
3430 class NBAIE(InfoExtractor):
# Information extractor for nba.com video pages. The media URL is built
# directly from the URL path; title/date/description are scraped from
# the page with the local _findProp helper.
# NOTE(review): elided listing — guards/returns and part of the info dict
# are not visible between the numbered rows.
3431 _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
3434 def _real_extract(self, url):
3435 mobj = re.match(self._VALID_URL, url)
3437 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3440 video_id = mobj.group(1)
3441 if video_id.endswith('/index.html'):
3442 video_id = video_id[:-len('/index.html')]
3444 webpage = self._download_webpage(url, video_id)
# Direct CDN URL derived from the path component of the page URL.
3446 video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
3447 def _findProp(rexp, default=None):
3448 m = re.search(rexp, webpage)
3450 return unescapeHTML(m.group(1))
3454 shortened_video_id = video_id.rpartition('/')[2]
3455 title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
3457 'id': shortened_video_id,
3461 'uploader_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
3462 'description': _findProp(r'<div class="description">(.*?)</h1>'),
3466 class JustinTVIE(InfoExtractor):
3467 """Information extractor for justin.tv and twitch.tv"""
3468 # TODO: One broadcast may be split into multiple videos. The key
3469 # 'broadcast_id' is the same for all parts, and 'broadcast_part'
3470 # starts at 1 and increases. Can we treat all parts as one video?
# NOTE(review): elided listing — try/return/loop-control lines between
# the numbered rows are not visible here.
3472 _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
3473 ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
3474 _JUSTIN_PAGE_LIMIT = 100
3475 IE_NAME = u'justin.tv'
3477 def report_extraction(self, file_id):
3478 """Report information extraction."""
3479 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3481 def report_download_page(self, channel, offset):
3482 """Report attempt to download a single page of videos."""
3483 self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
3484 (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
3486 # Return count of items, list of *valid* items
3487 def _parse_page(self, url):
3489 urlh = compat_urllib_request.urlopen(url)
3490 webpage_bytes = urlh.read()
3491 webpage = webpage_bytes.decode('utf-8', 'ignore')
3492 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3493 self._downloader.trouble(u'ERROR: unable to download video info JSON: %s' % compat_str(err))
# The API returns a JSON list of clips; a non-list response is an error
# object whose 'error' field is reported.
3496 response = json.loads(webpage)
3497 if type(response) != list:
3498 error_text = response.get('error', 'unknown error')
3499 self._downloader.trouble(u'ERROR: Justin.tv API: %s' % error_text)
3502 for clip in response:
3503 video_url = clip['video_file_url']
3505 video_extension = os.path.splitext(video_url)[1][1:]
# start_time 'YYYY-MM-DD...' → 'YYYYMMDD' by stripping dashes.
3506 video_date = re.sub('-', '', clip['start_time'][:10])
3507 video_uploader_id = clip.get('user_id', clip.get('channel_id'))
3508 video_id = clip['id']
3509 video_title = clip.get('title', video_id)
3513 'title': video_title,
3514 'uploader': clip.get('channel_name', video_uploader_id),
3515 'uploader_id': video_uploader_id,
3516 'upload_date': video_date,
3517 'ext': video_extension,
3519 return (len(response), info)
3521 def _real_extract(self, url):
3522 mobj = re.match(self._VALID_URL, url)
3524 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Group 1 = channel (paged archives API); group 2 = single broadcast id.
3527 api = 'http://api.justin.tv'
3528 video_id = mobj.group(mobj.lastindex)
3530 if mobj.lastindex == 1:
3532 api += '/channel/archives/%s.json'
3534 api += '/broadcast/by_archive/%s.json'
3535 api = api % (video_id,)
3537 self.report_extraction(video_id)
3541 limit = self._JUSTIN_PAGE_LIMIT
3544 self.report_download_page(video_id, offset)
3545 page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
3546 page_count, page_info = self._parse_page(page_url)
3547 info.extend(page_info)
# A short page (fewer than `limit` items) means we reached the end.
3548 if not paged or page_count != limit:
3553 class FunnyOrDieIE(InfoExtractor):
# Information extractor for funnyordie.com. Scrapes the <video> source
# URL, the player-page title, and the og:description meta tag.
# NOTE(review): elided listing — 'if m is None:' guards and returns
# between the numbered rows are not visible here.
3554 _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
3556 def _real_extract(self, url):
3557 mobj = re.match(self._VALID_URL, url)
3559 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3562 video_id = mobj.group('id')
3563 webpage = self._download_webpage(url, video_id)
3565 m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
3567 self._downloader.trouble(u'ERROR: unable to find video information')
3568 video_url = unescapeHTML(m.group('url'))
3570 m = re.search(r"class='player_page_h1'>\s+<a.*?>(?P<title>.*?)</a>", webpage)
3572 self._downloader.trouble(u'Cannot find video title')
3573 title = unescapeHTML(m.group('title'))
3575 m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
3577 desc = unescapeHTML(m.group('desc'))
3586 'description': desc,
3590 class SteamIE(InfoExtractor):
# Information extractor for store.steampowered.com game trailer pages.
# Matches movie entries, titles and thumbnails in lockstep with zip().
# NOTE(review): elided listing — parts of the regex, the videos list and
# the per-video info dict are not visible between the numbered rows.
3591 _VALID_URL = r"""http://store.steampowered.com/
3592 (?P<urltype>video|app)/ #If the page is only for videos or for a game
3594 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
# The verbose-mode regex requires overriding suitable() to pass re.VERBOSE.
3598 def suitable(cls, url):
3599 """Receives a URL and returns True if suitable for this IE."""
3600 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
3602 def _real_extract(self, url):
3603 m = re.match(self._VALID_URL, url, re.VERBOSE)
3604 urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
3605 gameID = m.group('gameID')
3606 videourl = 'http://store.steampowered.com/video/%s/' % gameID
3607 webpage = self._download_webpage(videourl, gameID)
3608 mweb = re.finditer(urlRE, webpage)
3609 namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
3610 titles = re.finditer(namesRE, webpage)
3611 thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
3612 thumbs = re.finditer(thumbsRE, webpage)
3614 for vid,vtitle,thumb in zip(mweb,titles,thumbs):
3615 video_id = vid.group('videoID')
3616 title = vtitle.group('videoName')
3617 video_url = vid.group('videoURL')
3618 video_thumb = thumb.group('thumbnail')
3620 self._downloader.trouble(u'ERROR: Cannot find video url for %s' % video_id)
3625 'title': unescapeHTML(title),
3626 'thumbnail': video_thumb
3631 class UstreamIE(InfoExtractor):
# Information extractor for ustream.tv recorded videos. The media URL is
# built directly from the video id; title and uploader id are scraped
# from data-* attributes on the page.
# NOTE(review): elided listing — the returned info dict is only partially
# visible between the numbered rows.
3632 _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
3633 IE_NAME = u'ustream'
3635 def _real_extract(self, url):
3636 m = re.match(self._VALID_URL, url)
3637 video_id = m.group('videoID')
3638 video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
3639 webpage = self._download_webpage(url, video_id)
3640 m = re.search(r'data-title="(?P<title>.+)"',webpage)
3641 title = m.group('title')
3642 m = re.search(r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',webpage)
3643 uploader = m.group('uploader')
3649 'uploader': uploader
3653 class RBMARadioIE(InfoExtractor):
# Information extractor for rbmaradio.com shows. Show metadata is a JSON
# blob embedded in an inline <script> (window.gon); the stream URL comes
# from its 'akamai_url' field with a fixed 256kbps cbr query appended.
3654 _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'
3656 def _real_extract(self, url):
3657 m = re.match(self._VALID_URL, url)
3658 video_id = m.group('videoID')
3660 webpage = self._download_webpage(url, video_id)
3661 m = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
3663 raise ExtractorError(u'Cannot find metadata')
3664 json_data = m.group(1)
3667 data = json.loads(json_data)
3668 except ValueError as e:
3669 raise ExtractorError(u'Invalid JSON: ' + str(e))
3671 video_url = data['akamai_url'] + '&cbr=256'
3672 url_parts = compat_urllib_parse_urlparse(video_url)
# Extension is whatever follows the last '.' in the URL path.
3673 video_ext = url_parts.path.rpartition('.')[2]
3678 'title': data['title'],
3679 'description': data.get('teaser_text'),
3680 'location': data.get('country_of_origin'),
3681 'uploader': data.get('host', {}).get('name'),
3682 'uploader_id': data.get('host', {}).get('slug'),
3683 'thumbnail': data.get('image', {}).get('large_url_2x'),
3684 'duration': data.get('duration'),
3689 class YouPornIE(InfoExtractor):
3690 """Information extractor for youporn.com."""
# NOTE(review): elided listing — loop headers, returns, and parts of the
# per-format info dict are not visible between the numbered rows.
3691 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
3693 def _print_formats(self, formats):
3694 """Print all available formats"""
3695 print(u'Available formats:')
3696 print(u'ext\t\tformat')
3697 print(u'---------------------------------')
3698 for format in formats:
3699 print(u'%s\t\t%s' % (format['ext'], format['format']))
3701 def _specific(self, req_format, formats):
# Linear search for the entry whose 'format' equals req_format
# (surrounding loop/return lines elided in this listing).
3703 if(x["format"]==req_format):
3707 def _real_extract(self, url):
3708 mobj = re.match(self._VALID_URL, url)
3710 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3713 video_id = mobj.group('videoid')
# Age gate is bypassed with a cookie before fetching the page.
3715 req = compat_urllib_request.Request(url)
3716 req.add_header('Cookie', 'age_verified=1')
3717 webpage = self._download_webpage(req, video_id)
3719 # Get the video title
3720 result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
3722 raise ExtractorError(u'Unable to extract video title')
3723 video_title = result.group('title').strip()
3725 # Get the video date
3726 result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
3728 self._downloader.report_warning(u'unable to extract video date')
3731 upload_date = result.group('date').strip()
3733 # Get the video uploader
3734 result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
3736 self._downloader.report_warning(u'unable to extract uploader')
3737 video_uploader = None
3739 video_uploader = result.group('uploader').strip()
3740 video_uploader = clean_html( video_uploader )
3742 # Get all of the formats available
3743 DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
3744 result = re.search(DOWNLOAD_LIST_RE, webpage)
3746 raise ExtractorError(u'Unable to extract download list')
3747 download_list_html = result.group('download_list').strip()
3749 # Get all of the links from the page
3750 LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
3751 links = re.findall(LINK_RE, download_list_html)
3752 if(len(links) == 0):
3753 raise ExtractorError(u'ERROR: no known formats available for video')
3755 self._downloader.to_screen(u'[youporn] Links found: %d' % len(links))
3760 # A link looks like this:
3761 # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
3762 # A path looks like this:
3763 # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
# Size and bitrate (e.g. '480p', '370k') come from path segment 4.
3764 video_url = unescapeHTML( link )
3765 path = compat_urllib_parse_urlparse( video_url ).path
3766 extension = os.path.splitext( path )[1][1:]
3767 format = path.split('/')[4].split('_')[:2]
3770 format = "-".join( format )
3771 title = u'%s-%s-%s' % (video_title, size, bitrate)
3776 'uploader': video_uploader,
3777 'upload_date': upload_date,
3782 'description': None,
3786 if self._downloader.params.get('listformats', None):
3787 self._print_formats(formats)
3790 req_format = self._downloader.params.get('format', None)
3791 self._downloader.to_screen(u'[youporn] Format: %s' % req_format)
# Format selection: best/worst/all/specific (branch bodies partially
# elided in this listing).
3793 if req_format is None or req_format == 'best':
3795 elif req_format == 'worst':
3796 return [formats[-1]]
3797 elif req_format in ('-1', 'all'):
3800 format = self._specific( req_format, formats )
3802 self._downloader.trouble(u'ERROR: requested format not available')
3808 class PornotubeIE(InfoExtractor):
3809 """Information extractor for pornotube.com."""
# NOTE(review): elided listing — guards/returns and part of the info dict
# are not visible between the numbered rows.
3810 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
3812 def _real_extract(self, url):
3813 mobj = re.match(self._VALID_URL, url)
3815 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Both the id and the display title come straight from the URL.
3818 video_id = mobj.group('videoid')
3819 video_title = mobj.group('title')
3821 # Get webpage content
3822 webpage = self._download_webpage(url, video_id)
3825 VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
3826 result = re.search(VIDEO_URL_RE, webpage)
3828 self._downloader.trouble(u'ERROR: unable to extract video url')
3830 video_url = compat_urllib_parse.unquote(result.group('url'))
3832 #Get the uploaded date
3833 VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
3834 result = re.search(VIDEO_UPLOADED_RE, webpage)
3836 self._downloader.trouble(u'ERROR: unable to extract video title')
3838 upload_date = result.group('date')
3840 info = {'id': video_id,
3843 'upload_date': upload_date,
3844 'title': video_title,
3850 class YouJizzIE(InfoExtractor):
3851 """Information extractor for youjizz.com."""
# NOTE(review): elided listing — guards/returns and part of the info dict
# are not visible between the numbered rows.
3852 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'
3854 def _real_extract(self, url):
3855 mobj = re.match(self._VALID_URL, url)
3857 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3860 video_id = mobj.group('videoid')
3862 # Get webpage content
3863 webpage = self._download_webpage(url, video_id)
3865 # Get the video title
3866 result = re.search(r'<title>(?P<title>.*)</title>', webpage)
3868 raise ExtractorError(u'ERROR: unable to extract video title')
3869 video_title = result.group('title').strip()
3871 # Get the embed page
# The real source URL lives on a separate embed page; its numeric id
# replaces the slug-style id from the watch URL.
3872 result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
3874 raise ExtractorError(u'ERROR: unable to extract embed page')
3876 embed_page_url = result.group(0).strip()
3877 video_id = result.group('videoid')
3879 webpage = self._download_webpage(embed_page_url, video_id)
3882 result = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
3884 raise ExtractorError(u'ERROR: unable to extract video url')
3885 video_url = result.group('source')
3887 info = {'id': video_id,
3889 'title': video_title,
3892 'player_url': embed_page_url}
3896 class EightTracksIE(InfoExtractor):
# Information extractor for 8tracks.com mixes. Reads the PAGE.mix JSON
# embedded in the page, then walks the play/next API one track at a time
# until 'at_last_track' is set.
# NOTE(review): elided listing — mix_id assignment, the result list and
# loop-exit lines are not visible between the numbered rows.
3898 _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'
3900 def _real_extract(self, url):
3901 mobj = re.match(self._VALID_URL, url)
3903 raise ExtractorError(u'Invalid URL: %s' % url)
3904 playlist_id = mobj.group('id')
3906 webpage = self._download_webpage(url, playlist_id)
3908 m = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
3910 raise ExtractorError(u'Cannot find trax information')
3911 json_like = m.group(1)
3912 data = json.loads(json_like)
# Random session id for the play API; NOTE(review): uses `random`, not
# `secrets` — fine here since it is not security-sensitive.
3914 session = str(random.randint(0, 1000000000))
3916 track_count = data['tracks_count']
3917 first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
3918 next_url = first_url
3920 for i in itertools.count():
3921 api_json = self._download_webpage(next_url, playlist_id,
3922 note=u'Downloading song information %s/%s' % (str(i+1), track_count),
3923 errnote=u'Failed to download song information')
3924 api_data = json.loads(api_json)
3925 track_data = api_data[u'set']['track']
3927 'id': track_data['id'],
3928 'url': track_data['track_file_stream_url'],
3929 'title': track_data['performer'] + u' - ' + track_data['name'],
3930 'raw_title': track_data['name'],
3931 'uploader_id': data['user']['login'],
3935 if api_data['set']['at_last_track']:
3937 next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
3940 class KeekIE(InfoExtractor):
# Information extractor for keek.com. Media and thumbnail URLs are built
# directly from the video id; title and uploader are scraped from the page.
# NOTE(review): elided listing — the returned info dict is only partially
# visible between the numbered rows.
3941 _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
3944 def _real_extract(self, url):
3945 m = re.match(self._VALID_URL, url)
3946 video_id = m.group('videoID')
3947 video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
3948 thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
3949 webpage = self._download_webpage(url, video_id)
3950 m = re.search(r'<meta property="og:title" content="(?P<title>.+)"', webpage)
3951 title = unescapeHTML(m.group('title'))
3952 m = re.search(r'<div class="bio-names-and-report">[\s\n]+<h4>(?P<uploader>\w+)</h4>', webpage)
3953 uploader = unescapeHTML(m.group('uploader'))
3959 'thumbnail': thumbnail,
3960 'uploader': uploader
3964 class TEDIE(InfoExtractor):
# Information extractor for ted.com talks and playlists. A talk URL is
# handled directly by _talk_info; a playlist URL enumerates its talks and
# calls _talk_info for each.
# NOTE(review): elided listing — parts of the verbose regexes and the
# returned info dicts are not visible between the numbered rows.
3965 _VALID_URL=r'''http://www.ted.com/
3967 ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
3969 ((?P<type_talk>talks)) # We have a simple talk
3971 /(?P<name>\w+) # Here goes the name and then ".html"
# The verbose-mode regex requires overriding suitable() to pass re.VERBOSE.
3975 def suitable(cls, url):
3976 """Receives a URL and returns True if suitable for this IE."""
3977 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
3979 def _real_extract(self, url):
3980 m=re.match(self._VALID_URL, url, re.VERBOSE)
3981 if m.group('type_talk'):
3982 return [self._talk_info(url)]
3984 playlist_id=m.group('playlist_id')
3985 name=m.group('name')
3986 self._downloader.to_screen(u'[%s] Getting info of playlist %s: "%s"' % (self.IE_NAME,playlist_id,name))
3987 return self._playlist_videos_info(url,name,playlist_id)
3989 def _talk_video_link(self,mediaSlug):
3990 '''Returns the video link for that mediaSlug'''
3991 return 'http://download.ted.com/talks/%s.mp4' % mediaSlug
3993 def _playlist_videos_info(self,url,name,playlist_id=0):
3994 '''Returns the videos of the playlist'''
3996 <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
3997 ([.\s]*?)data-playlist_item_id="(\d+)"
3998 ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
4000 video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
4001 webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
4002 m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
4003 m_names=re.finditer(video_name_RE,webpage)
# Pair each talk entry with its title link, in document order.
4005 for m_video, m_name in zip(m_videos,m_names):
4006 video_id=m_video.group('video_id')
4007 talk_url='http://www.ted.com%s' % m_name.group('talk_url')
4008 info.append(self._talk_info(talk_url,video_id))
4011 def _talk_info(self, url, video_id=0):
4012 """Return the video for the talk in the url"""
4013 m=re.match(self._VALID_URL, url,re.VERBOSE)
4014 videoName=m.group('name')
4015 webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
4016 # If the url includes the language we get the title translated
4017 title_RE=r'<h1><span id="altHeadline" >(?P<title>.*)</span></h1>'
4018 title=re.search(title_RE, webpage).group('title')
4019 info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
4020 "id":(?P<videoID>[\d]+).*?
4021 "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
4022 thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
4023 thumb_match=re.search(thumb_RE,webpage)
4024 info_match=re.search(info_RE,webpage,re.VERBOSE)
4025 video_id=info_match.group('videoID')
4026 mediaSlug=info_match.group('mediaSlug')
4027 video_url=self._talk_video_link(mediaSlug)
4033 'thumbnail': thumb_match.group('thumbnail')
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de videos."""
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        """Fetch the metadata XML for the video behind *url* and return a
        one-element list with its info dictionary.

        Fixes vs. the original block: the fallback split on the parent
        path is now guarded by `if not video_id:` (previously the id was
        always clobbered), both error paths `return` after reporting
        trouble (previously execution fell through on broken metadata),
        missing metadata fields get explicit defaults instead of raising
        NameError, and the result dict is actually built and returned.
        """
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata; url_flv and title are mandatory
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            self._downloader.trouble(u'ERROR: unable to extract download url')
            return
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        title = title_el.text
        # format/description/thumbnail are optional; fall back gracefully
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            video_format = extension
        else:
            video_format = format_id_el.text
        description_el = metadata.find('description')
        if description_el is not None:
            description = description_el.text
        else:
            description = None
        imagePreview_el = metadata.find('imagePreview')
        if imagePreview_el is not None:
            thumbnail = imagePreview_el.text
        else:
            thumbnail = None
        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': video_format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
4093 def gen_extractors():
4094 """ Return a list of an instance of every supported extractor.
4095 The order does matter; the first extractor matched is the one handling the URL.
4098 YoutubePlaylistIE(),
4122 StanfordOpenClassroomIE(),