2 # -*- coding: utf-8 -*-
4 from __future__ import absolute_import
15 import xml.etree.ElementTree
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    title:          Video title, unescaped.
    ext:            Video filename extension.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The .srt file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    The fields should all be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.

    NOTE(review): reconstructed from a numbered partial dump -- the guard
    lines (`if note is None:`, `try:`, the `working`/`initialize` def lines
    and the `IE_NAME` property decorator) were missing and are restored here.
    """

    _ready = False          # becomes True once _real_initialize() has run
    _downloader = None      # FileDownloader instance, set via set_downloader()
    _WORKING = True         # subclasses set False when the extractor is known broken

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(self._VALID_URL, url) is not None

    def working(self):
        """Getter method for _WORKING."""
        return self._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @property
    def IE_NAME(self):
        """Extractor name: the class name minus the trailing 'IE' suffix."""
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the response handle """
        if note is None:
            note = u'Downloading video webpage'
        self._downloader.to_screen(u'[%s] %s: %s' % (self.IE_NAME, video_id, note))
        try:
            return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is None:
                errnote = u'Unable to download webpage'
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the data of the page as a string """
        urlh = self._request_webpage(url_or_request, video_id, note, errnote)
        webpage_bytes = urlh.read()
        return webpage_bytes.decode('utf-8', 'replace')
# NOTE(review): numbered partial dump -- gaps in the embedded numbering
# (132-134, 146-147, 152, 162-166, 168-172, 174-190) mean lines are missing
# here, including the `_VALID_URL = r'''^(` opener for the verbose regex
# below and most entries of the itag -> extension/dimension lookup tables.
# Do not treat this span as compilable as-is.
130 class YoutubeIE(InfoExtractor):
131 """Information extractor for youtube.com."""
135 (?:https?://)? # http(s):// (optional)
136 (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
137 tube\.majestyc\.net/) # the various hostnames, with wildcard subdomains
138 (?:.*?\#/)? # handle anchor (#/) redirect urls
139 (?!view_play_list|my_playlists|artist|playlist) # ignore playlist URLs
140 (?: # the various things that can precede the ID:
141 (?:(?:v|embed|e)/) # v/ or embed/ or e/
142 |(?: # or the v= param in all its forms
143 (?:watch(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
144 (?:\?|\#!?) # the params delimiter ? or # or #!
145 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
148 )? # optional -> youtube.com/xxxx is OK
149 )? # all until now is optional -> you can pass the naked ID
150 ([0-9A-Za-z_-]+) # here is it! the YouTube video ID
151 (?(1).+)? # if we found the ID, everything can follow
# The conditional group `(?(1).+)?` above implies group 1 wraps the whole
# URL prefix (its opener is among the missing lines); the ID is group 2.
153 _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
154 _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
155 _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
156 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
157 _NETRC_MACHINE = 'youtube'
158 # Listed in order of quality
159 _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
160 _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
161 _video_extensions = {
# NOTE(review): only the '38' entry of _video_extensions survives in this
# dump; the other itag -> extension entries (orig. 162-166, 168+) are missing.
167 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
173 _video_dimensions = {
# NOTE(review): every entry of _video_dimensions (orig. 174-190) is missing.
def suitable(self, url):
    """Receives a URL and returns True if suitable for this IE."""
    # _VALID_URL is written with embedded comments/whitespace, hence re.VERBOSE.
    match = re.match(self._VALID_URL, url, re.VERBOSE)
    return match is not None
# Progress-reporting helpers: each one just prints a tagged status line via
# the downloader. Two docstrings below were wrong copy-pastes and are fixed.
195 def report_lang(self):
196 """Report attempt to set language."""
197 self._downloader.to_screen(u'[youtube] Setting language')
199 def report_login(self):
200 """Report attempt to log in."""
201 self._downloader.to_screen(u'[youtube] Logging in')
203 def report_age_confirmation(self):
204 """Report attempt to confirm age."""
205 self._downloader.to_screen(u'[youtube] Confirming age')
207 def report_video_webpage_download(self, video_id):
208 """Report attempt to download video webpage."""
209 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
211 def report_video_info_webpage_download(self, video_id):
212 """Report attempt to download video info webpage."""
213 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
215 def report_video_subtitles_download(self, video_id):
216 """Report attempt to download video subtitles."""
217 self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)
219 def report_information_extraction(self, video_id):
220 """Report attempt to extract video information."""
221 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
223 def report_unavailable_format(self, video_id, format):
224 """Report that the requested format is not available."""
225 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
227 def report_rtmp_download(self):
228 """Indicate the download will use the RTMP protocol."""
229 self._downloader.to_screen(u'[youtube] RTMP download detected')
def _closed_captions_xml_to_srt(self, xml_string):
    """Convert YouTube's timedtext XML into SubRip (.srt) text.

    NOTE(review): the numbered dump was missing the accumulator
    initialisation, the float() conversion of `start`, and the final
    return (orig. 232, 237, 246); restored here -- without them the
    method raised NameError/TypeError on any input.
    """
    srt = ''
    texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
    # TODO parse xml instead of regex
    for n, (start, dur_tag, dur, caption) in enumerate(texts):
        if not dur: dur = '4'  # default caption length in seconds when no dur attr
        start = float(start)
        end = start + float(dur)
        # Format as HH:MM:SS,mmm timestamps required by SubRip
        start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
        end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
        caption = unescapeHTML(caption)
        caption = unescapeHTML(caption) # double cycle, intentional
        srt += str(n+1) + '\n'
        srt += start + ' --> ' + end + '\n'
        srt += caption + '\n\n'
    return srt
# Fetch the caption track list, pick a language (user choice > 'en' > first
# listed), download its timedtext XML and convert it to SRT. Returns a
# (warning_message_or_None, srt_text_or_None) tuple.
# NOTE(review): numbered partial dump -- the `try:` lines (orig. 251, 273),
# the `srt_lang = 'en'` branch (orig. 262-263) and the remaining urlencode
# keys (orig. 268, 270-271) are missing from this span.
248 def _extract_subtitles(self, video_id):
249 self.report_video_subtitles_download(video_id)
250 request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
252 srt_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
253 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
254 return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
255 srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
256 srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
257 if not srt_lang_list:
258 return (u'WARNING: video has no closed captions', None)
259 if self._downloader.params.get('subtitleslang', False):
260 srt_lang = self._downloader.params.get('subtitleslang')
261 elif 'en' in srt_lang_list:
# fallback: first available language when neither the user choice nor 'en' applies
264 srt_lang = list(srt_lang_list.keys())[0]
265 if not srt_lang in srt_lang_list:
266 return (u'WARNING: no closed captions found in the specified language', None)
267 params = compat_urllib_parse.urlencode({
269 'name': srt_lang_list[srt_lang].encode('utf-8'),
272 url = 'http://www.youtube.com/api/timedtext?' + params
274 srt_xml = compat_urllib_request.urlopen(url).read().decode('utf-8')
275 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
276 return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
278 return (u'WARNING: Did not fetch video subtitles', None)
279 return (None, self._closed_captions_xml_to_srt(srt_xml))
def _print_formats(self, formats):
    """Print every available itag with its container and resolution.

    NOTE(review): the dump was missing the `for` line (orig. 283);
    restored so the per-format print runs once per entry.
    """
    print('Available formats:')
    for x in formats:
        print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
# Session setup: resolve credentials (CLI options or ~/.netrc), force the
# site language to English, log in to the Google account if credentials
# were found, then confirm age. All failures are reported as warnings
# (best effort) except age confirmation, which is a hard error.
# NOTE(review): numbered partial dump -- missing lines include the early
# `return` for no downloader, the `username = password = None` defaults,
# the netrc success branch (orig. 301-304), all `try:` openers, the bulk of
# the login form dict (orig. 337-361) and the age_form opener (orig. 377-379).
286 def _real_initialize(self):
287 if self._downloader is None:
292 downloader_params = self._downloader.params
294 # Attempt to use provided username and password or .netrc data
295 if downloader_params.get('username', None) is not None:
296 username = downloader_params['username']
297 password = downloader_params['password']
298 elif downloader_params.get('usenetrc', False):
300 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
305 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
306 except (IOError, netrc.NetrcParseError) as err:
307 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
# Set language (best effort -- a failure only produces a warning)
311 request = compat_urllib_request.Request(self._LANG_URL)
314 compat_urllib_request.urlopen(request).read()
315 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
316 self._downloader.to_stderr(u'WARNING: unable to set language: %s' % compat_str(err))
319 # No authentication to be performed
# Log in: fetch the login page to scrape the GALX/dsh hidden form tokens
323 request = compat_urllib_request.Request(self._LOGIN_URL)
325 login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
326 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
327 self._downloader.to_stderr(u'WARNING: unable to fetch login page: %s' % compat_str(err))
332 match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
334 galx = match.group(1)
336 match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
342 u'continue': u'http://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
346 u'PersistentCookie': u'yes',
348 u'bgresponse': u'js_disabled',
349 u'checkConnection': u'',
350 u'checkedDomains': u'youtube',
356 u'signIn': u'Sign in',
358 u'service': u'youtube',
362 # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
364 login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
365 login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
366 request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
369 login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
# The login form reappearing in the response means authentication failed
370 if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
371 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
373 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
374 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
# Confirm age (hard error on failure, unlike the warnings above)
380 'action_confirm': 'Confirm',
382 request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
384 self.report_age_confirmation()
385 age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
386 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
387 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
def _extract_id(self, url):
    """Return the YouTube video ID from a URL (capture group 2 of _VALID_URL).

    NOTE(review): the dump was missing the `if mobj is None:` guard and
    both return statements (orig. 392, 394, 396); restored here.
    """
    mobj = re.match(self._VALID_URL, url, re.VERBOSE)
    if mobj is None:
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        return
    video_id = mobj.group(2)
    return video_id
# Main YouTube extraction: resolve next_url redirects, download the watch
# page, fetch get_video_info (trying several &el= variants until one yields
# a token), then pull uploader/title/thumbnail/date/description/subtitles/
# duration and build one info dict per selected format.
# NOTE(review): numbered partial dump -- `try:` openers, `if mobj is not
# None:` guards, `else:`/`break`/`return` lines and the results-list
# opener (orig. ~561-572) are missing throughout; logic below is NOT
# directly compilable.
398 def _real_extract(self, url):
399 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
400 mobj = re.search(self._NEXT_URL_RE, url)
402 url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
403 video_id = self._extract_id(url)
406 self.report_video_webpage_download(video_id)
407 url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
408 request = compat_urllib_request.Request(url)
410 video_webpage_bytes = compat_urllib_request.urlopen(request).read()
411 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
412 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
415 video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')
417 # Attempt to extract SWF player URL
418 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
420 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
425 self.report_video_info_webpage_download(video_id)
426 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
427 video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
428 % (video_id, el_type))
429 request = compat_urllib_request.Request(video_info_url)
431 video_info_webpage_bytes = compat_urllib_request.urlopen(request).read()
432 video_info_webpage = video_info_webpage_bytes.decode('utf-8', 'ignore')
433 video_info = compat_parse_qs(video_info_webpage)
434 if 'token' in video_info:
436 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
437 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
439 if 'token' not in video_info:
440 if 'reason' in video_info:
441 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0])
443 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
446 # Check for "rental" videos
447 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
448 self._downloader.trouble(u'ERROR: "rental" videos not supported')
451 # Start extracting information
452 self.report_information_extraction(video_id)
455 if 'author' not in video_info:
456 self._downloader.trouble(u'ERROR: unable to extract uploader name')
458 video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
461 video_uploader_id = None
462 mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
464 video_uploader_id = mobj.group(1)
466 self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
469 if 'title' not in video_info:
470 self._downloader.trouble(u'ERROR: unable to extract video title')
472 video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
475 if 'thumbnail_url' not in video_info:
476 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
478 else: # don't panic if we can't find it
479 video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
# upload date: scraped from the watch page, normalised to YYYYMMDD
483 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
485 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
486 format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
487 for expression in format_expressions:
489 upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
494 video_description = get_element_by_id("eow-description", video_webpage)
495 if video_description:
496 video_description = clean_html(video_description)
498 video_description = ''
501 video_subtitles = None
502 if self._downloader.params.get('writesubtitles', False):
503 (srt_error, video_subtitles) = self._extract_subtitles(video_id)
505 self._downloader.trouble(srt_error)
507 if 'length_seconds' not in video_info:
508 self._downloader.trouble(u'WARNING: unable to extract video duration')
511 video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
514 video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])
516 # Decide which formats to download
517 req_format = self._downloader.params.get('format', None)
519 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
520 self.report_rtmp_download()
521 video_url_list = [(None, video_info['conn'][0])]
522 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
523 url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
524 url_data = [compat_parse_qs(uds) for uds in url_data_strs]
525 url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
526 url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)
528 format_limit = self._downloader.params.get('format_limit', None)
529 available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
530 if format_limit is not None and format_limit in available_formats:
531 format_list = available_formats[available_formats.index(format_limit):]
533 format_list = available_formats
534 existing_formats = [x for x in format_list if x in url_map]
535 if len(existing_formats) == 0:
536 self._downloader.trouble(u'ERROR: no known formats available for video')
538 if self._downloader.params.get('listformats', None):
539 self._print_formats(existing_formats)
541 if req_format is None or req_format == 'best':
542 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
543 elif req_format == 'worst':
544 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
545 elif req_format in ('-1', 'all'):
546 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
548 # Specific formats. We pick the first in a slash-delimeted sequence.
549 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
550 req_formats = req_format.split('/')
551 video_url_list = None
552 for rf in req_formats:
554 video_url_list = [(rf, url_map[rf])]
556 if video_url_list is None:
557 self._downloader.trouble(u'ERROR: requested format not available')
560 self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
# One result dict per selected (itag, url) pair
564 for format_param, video_real_url in video_url_list:
566 video_extension = self._video_extensions.get(format_param, 'flv')
568 video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
569 self._video_dimensions.get(format_param, '???'))
573 'url': video_real_url,
574 'uploader': video_uploader,
575 'uploader_id': video_uploader_id,
576 'upload_date': upload_date,
577 'title': video_title,
578 'ext': video_extension,
579 'format': video_format,
580 'thumbnail': video_thumbnail,
581 'description': video_description,
582 'player_url': player_url,
583 'subtitles': video_subtitles,
584 'duration': video_duration
# Metacafe extractor: passes the family-filter disclaimer in
# _real_initialize, then scrapes mediaURL/gdaKey (or the flashvars
# mediaData JSON) plus title and submitter from the watch page. Pure
# yt-<id> videos are delegated to the YouTube extractor.
# NOTE(review): numbered partial dump -- `try:` openers, `if mobj is
# None:` guards, returns and the disclaimer_form/result-dict openers
# are among the missing lines; not compilable as-is.
589 class MetacafeIE(InfoExtractor):
590 """Information Extractor for metacafe.com."""
592 _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
593 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
594 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
595 IE_NAME = u'metacafe'
597 def __init__(self, downloader=None):
598 InfoExtractor.__init__(self, downloader)
600 def report_disclaimer(self):
601 """Report disclaimer retrieval."""
602 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
604 def report_age_confirmation(self):
605 """Report attempt to confirm age."""
606 self._downloader.to_screen(u'[metacafe] Confirming age')
608 def report_download_webpage(self, video_id):
609 """Report webpage download."""
610 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
612 def report_extraction(self, video_id):
613 """Report information extraction."""
614 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
616 def _real_initialize(self):
617 # Retrieve disclaimer
618 request = compat_urllib_request.Request(self._DISCLAIMER)
620 self.report_disclaimer()
621 disclaimer = compat_urllib_request.urlopen(request).read()
622 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
623 self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err))
# POST the family-filter form to confirm age for the session
629 'submit': "Continue - I'm over 18",
631 request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
633 self.report_age_confirmation()
634 disclaimer = compat_urllib_request.urlopen(request).read()
635 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
636 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
639 def _real_extract(self, url):
640 # Extract id and simplified title from URL
641 mobj = re.match(self._VALID_URL, url)
643 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
646 video_id = mobj.group(1)
648 # Check if video comes from YouTube
649 mobj2 = re.match(r'^yt-(.*)$', video_id)
650 if mobj2 is not None:
651 self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
654 # Retrieve video webpage to extract further information
655 request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
657 self.report_download_webpage(video_id)
658 webpage = compat_urllib_request.urlopen(request).read()
659 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
660 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
663 # Extract URL, uploader and title from webpage
664 self.report_extraction(video_id)
665 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
667 mediaURL = compat_urllib_parse.unquote(mobj.group(1))
668 video_extension = mediaURL[-3:]
670 # Extract gdaKey if available
671 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
675 gdaKey = mobj.group(1)
676 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
# Fallback path: parse the flashvars blob when &mediaURL= is absent
678 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
680 self._downloader.trouble(u'ERROR: unable to extract media URL')
682 vardict = compat_parse_qs(mobj.group(1))
683 if 'mediaData' not in vardict:
684 self._downloader.trouble(u'ERROR: unable to extract media URL')
686 mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
688 self._downloader.trouble(u'ERROR: unable to extract media URL')
690 mediaURL = mobj.group(1).replace('\\/', '/')
691 video_extension = mediaURL[-3:]
692 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
694 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
696 self._downloader.trouble(u'ERROR: unable to extract title')
698 video_title = mobj.group(1).decode('utf-8')
700 mobj = re.search(r'submitter=(.*?);', webpage)
702 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
704 video_uploader = mobj.group(1)
707 'id': video_id.decode('utf-8'),
708 'url': video_url.decode('utf-8'),
709 'uploader': video_uploader.decode('utf-8'),
711 'title': video_title,
712 'ext': video_extension.decode('utf-8'),
# Dailymotion extractor: downloads the page with the family filter cookie
# disabled, reads the flashvars blob, picks the best quality URL present
# (hd1080 > hd720 > hq > sd > ld > video_url) and scrapes title, uploader
# and upload date from the markup.
# NOTE(review): numbered partial dump -- guards, `try:`/`else:` lines, the
# max_quality assignment/break (orig. 755-759) and the result-dict opener
# (orig. ~794-797) are among the missing lines; not compilable as-is.
716 class DailymotionIE(InfoExtractor):
717 """Information Extractor for Dailymotion"""
719 _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
720 IE_NAME = u'dailymotion'
723 def __init__(self, downloader=None):
724 InfoExtractor.__init__(self, downloader)
726 def report_extraction(self, video_id):
727 """Report information extraction."""
728 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
730 def _real_extract(self, url):
731 # Extract id and simplified title from URL
732 mobj = re.match(self._VALID_URL, url)
734 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
737 video_id = mobj.group(1).split('_')[0].split('?')[0]
739 video_extension = 'mp4'
741 # Retrieve video webpage to extract further information
742 request = compat_urllib_request.Request(url)
743 request.add_header('Cookie', 'family_filter=off')
744 webpage = self._download_webpage(request, video_id)
746 # Extract URL, uploader and title from webpage
747 self.report_extraction(video_id)
748 mobj = re.search(r'\s*var flashvars = (.*)', webpage)
750 self._downloader.trouble(u'ERROR: unable to extract media URL')
752 flashvars = compat_urllib_parse.unquote(mobj.group(1))
# Quality keys listed best-first; the first one present in flashvars wins
754 for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
757 self._downloader.to_screen(u'[dailymotion] Using %s' % key)
760 self._downloader.trouble(u'ERROR: unable to extract video URL')
763 mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
765 self._downloader.trouble(u'ERROR: unable to extract video URL')
768 video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')
770 # TODO: support choosing qualities
772 mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
774 self._downloader.trouble(u'ERROR: unable to extract title')
776 video_title = unescapeHTML(mobj.group('title'))
778 video_uploader = None
779 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
781 # looking for official user
782 mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
783 if mobj_official is None:
784 self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
786 video_uploader = mobj_official.group(1)
788 video_uploader = mobj.group(1)
790 video_upload_date = None
791 mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
# page shows DD-MM-YYYY; reassembled as YYYYMMDD
793 video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)
798 'uploader': video_uploader,
799 'upload_date': video_upload_date,
800 'title': video_title,
801 'ext': video_extension,
# Photobucket extractor: the video id (an .flv filename) comes from the
# ?current= query parameter; the media URL is read from the page's
# video_src <link> tag and title/uploader from the <title> element.
# NOTE(review): numbered partial dump -- `try:` openers, `if mobj is
# None:` guards, returns and the result-dict opener are missing.
805 class PhotobucketIE(InfoExtractor):
806 """Information extractor for photobucket.com."""
808 _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
809 IE_NAME = u'photobucket'
811 def __init__(self, downloader=None):
812 InfoExtractor.__init__(self, downloader)
814 def report_download_webpage(self, video_id):
815 """Report webpage download."""
816 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
818 def report_extraction(self, video_id):
819 """Report information extraction."""
820 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
822 def _real_extract(self, url):
823 # Extract id from URL
824 mobj = re.match(self._VALID_URL, url)
826 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
829 video_id = mobj.group(1)
831 video_extension = 'flv'
833 # Retrieve video webpage to extract further information
834 request = compat_urllib_request.Request(url)
836 self.report_download_webpage(video_id)
837 webpage = compat_urllib_request.urlopen(request).read()
838 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
839 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
842 # Extract URL, uploader, and title from webpage
843 self.report_extraction(video_id)
844 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
846 self._downloader.trouble(u'ERROR: unable to extract media URL')
848 mediaURL = compat_urllib_parse.unquote(mobj.group(1))
852 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
854 self._downloader.trouble(u'ERROR: unable to extract title')
856 video_title = mobj.group(1).decode('utf-8')
858 video_uploader = mobj.group(2).decode('utf-8')
861 'id': video_id.decode('utf-8'),
862 'url': video_url.decode('utf-8'),
863 'uploader': video_uploader,
865 'title': video_title,
866 'ext': video_extension.decode('utf-8'),
# Yahoo! Video extractor: non-/watch/ URLs are first rewritten to the
# canonical /watch/<vid>/<id> form (via a recursive _real_extract call),
# then title/uploader/thumbnail/description and the player dimensions are
# scraped, and the real media URL is read from the getPlaylistFOP XML.
# NOTE(review): numbered partial dump -- `try:` openers, `if mobj is
# None:` guards, returns and the result-dict opener are missing.
870 class YahooIE(InfoExtractor):
871 """Information extractor for video.yahoo.com."""
874 # _VALID_URL matches all Yahoo! Video URLs
875 # _VPAGE_URL matches only the extractable '/watch/' URLs
876 _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
877 _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
878 IE_NAME = u'video.yahoo'
880 def __init__(self, downloader=None):
881 InfoExtractor.__init__(self, downloader)
883 def report_download_webpage(self, video_id):
884 """Report webpage download."""
885 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
887 def report_extraction(self, video_id):
888 """Report information extraction."""
889 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
891 def _real_extract(self, url, new_video=True):
892 # Extract ID from URL
893 mobj = re.match(self._VALID_URL, url)
895 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
898 video_id = mobj.group(2)
899 video_extension = 'flv'
901 # Rewrite valid but non-extractable URLs as
902 # extractable English language /watch/ URLs
903 if re.match(self._VPAGE_URL, url) is None:
904 request = compat_urllib_request.Request(url)
906 webpage = compat_urllib_request.urlopen(request).read()
907 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
908 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
911 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
913 self._downloader.trouble(u'ERROR: Unable to extract id field')
915 yahoo_id = mobj.group(1)
917 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
919 self._downloader.trouble(u'ERROR: Unable to extract vid field')
921 yahoo_vid = mobj.group(1)
# recurse once on the canonical URL (new_video=False stops further rewrites)
923 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
924 return self._real_extract(url, new_video=False)
926 # Retrieve video webpage to extract further information
927 request = compat_urllib_request.Request(url)
929 self.report_download_webpage(video_id)
930 webpage = compat_urllib_request.urlopen(request).read()
931 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
932 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
935 # Extract uploader and title from webpage
936 self.report_extraction(video_id)
937 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
939 self._downloader.trouble(u'ERROR: unable to extract video title')
941 video_title = mobj.group(1).decode('utf-8')
943 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
945 self._downloader.trouble(u'ERROR: unable to extract video uploader')
947 video_uploader = mobj.group(1).decode('utf-8')
949 # Extract video thumbnail
950 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
952 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
954 video_thumbnail = mobj.group(1).decode('utf-8')
956 # Extract video description
957 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
959 self._downloader.trouble(u'ERROR: unable to extract video description')
961 video_description = mobj.group(1).decode('utf-8')
962 if not video_description:
963 video_description = 'No description available.'
965 # Extract video height and width
966 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
968 self._downloader.trouble(u'ERROR: unable to extract video height')
970 yv_video_height = mobj.group(1)
972 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
974 self._downloader.trouble(u'ERROR: unable to extract video width')
976 yv_video_width = mobj.group(1)
978 # Retrieve video playlist to extract media URL
979 # I'm not completely sure what all these options are, but we
980 # seem to need most of them, otherwise the server sends a 401.
981 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
982 yv_bitrate = '700' # according to Wikipedia this is hard-coded
983 request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
984 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
985 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
987 self.report_download_webpage(video_id)
988 webpage = compat_urllib_request.urlopen(request).read()
989 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
990 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
993 # Extract media URL from playlist XML
994 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
996 self._downloader.trouble(u'ERROR: Unable to extract media URL')
998 video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
999 video_url = unescapeHTML(video_url)
1002 'id': video_id.decode('utf-8'),
1004 'uploader': video_uploader,
1005 'upload_date': None,
1006 'title': video_title,
1007 'ext': video_extension.decode('utf-8'),
1008 'thumbnail': video_thumbnail.decode('utf-8'),
1009 'description': video_description,
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs, including player/ and play_redirect_hls links
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract video information for a Vimeo URL.

        Returns a one-element list of info dictionaries, or None after
        reporting the failure through the downloader.
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        if not mobj.group('proto'):
            url = 'https://' + url
        if mobj.group('direct_link'):
            # Normalize player/HLS-redirect links to the canonical page URL
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON embedded in the page.  The string slicing
        # raises IndexError when the marker is absent and json.loads raises
        # ValueError on malformed JSON; catch both explicitly instead of
        # letting them escape (or swallowing everything).
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except (IndexError, ValueError):
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            video_description = ''

        # Extract upload date
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = {'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        # Take the best quality available; for/else makes the "nothing found"
        # branch explicit.
        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            self._downloader.trouble(u'ERROR: no known codec found')
            return

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
            % (video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date': video_upload_date,
            'title': video_title,
            'ext': video_extension,
            'thumbnail': video_thumbnail,
            'description': video_description,
        }]
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    # Live streams are distinguished by their final path component.
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)

    def fetch_webpage(self, url):
        """Download *url* and return its body, or None after reporting an error."""
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch *url*, search it with *regex* and collect groups into a dict.

        matchTuples is a list of (group_index, key, error_message); the
        corresponding group is stored under *key*.  Returns None after
        reporting an error when the pattern or any group fails to match.
        """
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                self._downloader.trouble(err)
                return
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        """Locate the RTMP information for a live stream page."""
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        # Raw strings throughout: '\.' and "\'" inside plain literals are
        # invalid escape sequences on modern Python (DeprecationWarning,
        # future SyntaxError).
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + r'.*?)\'.*?' +
                r'(http://.*?\.swf).*?' +
                r'(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path', u'ERROR: could not extract video path: %s' % url),
                (2, 'player', u'ERROR: could not extract video player: %s' % url),
                (3, 'url', u'ERROR: could not extract video url: %s' % url)
            ]
        )
        # NOTE(review): video_url is computed but never returned or handed to
        # the downloader -- live streams appear to be a dead end here; confirm
        # intended behavior before "fixing".
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Follow the Plus7 redirection chain and return an info dictionary."""
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                r'<name>(.*?)</name>.*?' +
                r'<dateVideo>(.*?)</dateVideo>.*?' +
                r'<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id', u'ERROR: could not extract video id: %s' % url),
                (2, 'title', u'ERROR: could not extract video title: %s' % url),
                (3, 'date', u'ERROR: could not extract video date: %s' % url),
                (4, 'url', u'ERROR: could not extract video url: %s' % url)
            ]
        )

        return {
            'id': info.get('id'),
            'url': compat_urllib_parse.unquote(info.get('url')),
            'uploader': u'arte.tv',
            'upload_date': info.get('date'),
            'title': info.get('title').decode('utf-8'),
            'ext': u'mp4',
            'format': u'NA',
            'player_url': None,
        }

    def _real_extract(self, url):
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor."""

    # Matches anything -- this IE must only be tried after all others
    # (hence the WARNING emitted in report_download_webpage).
    _VALID_URL = r'.*'
    IE_NAME = u'generic'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
        self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)

    def report_following_redirect(self, new_url):
        """Report that a redirect is being followed."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case restart chain."""
        class HeadRequest(compat_urllib_request.Request):
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    newheaders = dict((k, v) for k, v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                fp.read()
                fp.close()

                newheaders = dict((k, v) for k, v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                                      headers=newheaders,
                                                                      origin_req_host=req.get_origin_req_host(),
                                                                      unverifiable=True))

        # Build our opener with only the handlers we need, in order.
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        if url == new_url:
            return False

        self.report_following_redirect(new_url)
        self._downloader.download([new_url])
        return True

    def _real_extract(self, url):
        if self._test_redirect(url):
            return

        video_id = url.split('/')[-1]
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit
            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit: JWPlayer JS loader
            mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        if mobj is None:
            # BUG FIX: this error message previously said "unable to
            # extract title", copy-pasted from the branch above.
            self._downloader.trouble(u'ERROR: unable to extract uploader (domain name)')
            return
        video_uploader = mobj.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': video_uploader,
            'upload_date': None,
            'title': video_title,
            'ext': video_extension,
        }]
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries."""

    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        # Split "ytsearchN:terms" into the optional count prefix and the terms.
        prefix, query = query.split(':')
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
        elif prefix == 'all':
            self._download_n_results(query, self._max_youtube_results)
        else:
            # Keep the try body minimal: only int() can raise ValueError here.
            try:
                n = int(prefix)
            except ValueError:  # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return
            if n <= 0:
                self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                return
            elif n > self._max_youtube_results:
                self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                n = self._max_youtube_results
            self._download_n_results(query, n)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        limit = n

        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum + 1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50 * pagenum) + 1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err))
                return
            api_response = json.loads(data)['data']

            if 'items' not in api_response:
                self._downloader.trouble(u'[youtube] No video results')
                return

            new_ids = [video['id'] for video in api_response['items']]
            video_ids += new_ids

            # The API reports the real total; never request past it.
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries."""

    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        # Split "gvsearchN:terms" into the optional count prefix and the terms.
        prefix, query = query.split(':')
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
        elif prefix == 'all':
            self._download_n_results(query, self._max_google_results)
        else:
            try:
                n = int(prefix)
            except ValueError:  # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return
            if n <= 0:
                self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                return
            elif n > self._max_google_results:
                self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
                n = self._max_google_results
            self._download_n_results(query, n)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum * 10)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in video_ids:
                    video_ids.append(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        break

            # Flush the collected ids once, whether we stopped because n was
            # reached or because there are no more result pages; previously
            # this download loop was duplicated in both exit branches.
            if len(video_ids) == n or re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for video_id in video_ids:
                    self._downloader.download(['http://video.google.com/videoplay?docid=%s' % video_id])
                return

            pagenum += 1
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""

    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        # Split "yvsearchN:terms" into the optional count prefix and the terms.
        prefix, query = query.split(':')
        prefix = prefix[8:]
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
        else:
            try:
                n = int(prefix)
            except ValueError:  # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return
            if n <= 0:
                self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                return
            elif n > self._max_yahoo_results:
                self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
                n = self._max_yahoo_results
            self._download_n_results(query, n)

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        already_seen = set()
        pagenum = 1

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers; already_seen gives O(1) dedup while
            # video_ids preserves first-seen order.
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in already_seen:
                    video_ids.append(video_id)
                    already_seen.add(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        break

            # Flush once on either exit condition; the download loop used to
            # be duplicated in both branches.
            if len(video_ids) == n or re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for video_id in video_ids:
                    self._downloader.download(['http://video.yahoo.com/watch/%s' % video_id])
                return

            pagenum += 1
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    _VALID_URL = r'(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL|EC)?|PL|EC)([0-9A-Za-z-_]{10,})(?:/.*?/([0-9A-Za-z_-]+))?.*'
    _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
    _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&([^&"]+&)*list=.*?%s'
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:playlist'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        # Single video case: the URL carried an explicit video id.
        if mobj.group(3) is not None:
            self._downloader.download([mobj.group(3)])
            return

        # Download playlist pages
        # prefix is 'p' as default for playlists but there are other types that need extra care
        playlist_prefix = mobj.group(1)
        if playlist_prefix == 'a':
            playlist_access = 'artist'
        else:
            playlist_prefix = 'p'
            playlist_access = 'view_play_list'
        playlist_id = mobj.group(2)
        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(playlist_id, pagenum)
            url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
            request = compat_urllib_request.Request(url)
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers; use a distinct loop variable so the
            # outer 'mobj' is not clobbered.
            ids_in_page = []
            for page_mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
                if page_mobj.group(1) not in ids_in_page:
                    ids_in_page.append(page_mobj.group(1))
            video_ids.extend(ids_in_page)

            if self._MORE_PAGES_INDICATOR not in page:
                break
            pagenum += 1

        total = len(video_ids)

        # Apply --playlist-start / --playlist-end (playlistend == -1 means
        # "through the end").
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)
        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        if len(video_ids) == total:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total))
        else:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(video_ids)))

        # 'video_id' instead of 'id' -- avoid shadowing the builtin.
        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:channel'

    def report_download_page(self, channel_id, pagenum):
        """Report attempt to download channel page with given number."""
        self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))

    def _real_extract(self, url):
        # Extract channel id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        # Download channel pages
        channel_id = mobj.group(1)
        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(channel_id, pagenum)
            url = self._TEMPLATE_URL % (channel_id, pagenum)
            request = compat_urllib_request.Request(url)
            try:
                # consistency: spell the codec 'utf-8' like the rest of the
                # file (was 'utf8' -- same codec, inconsistent spelling)
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []
            for page_mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
                if page_mobj.group(1) not in ids_in_page:
                    ids_in_page.append(page_mobj.group(1))
            video_ids.extend(ids_in_page)

            if self._MORE_PAGES_INDICATOR not in page:
                break
            pagenum += 1

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        # 'video_id' instead of 'id' -- avoid shadowing the builtin.
        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_extract(self, url):
        # Pull the username out of the URL (or the ytuser: shorthand).
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.
        video_ids = []
        page_index = 0

        while True:
            start_index = page_index * self._GDATA_PAGE_SIZE + 1
            self.report_download_page(username, start_index)

            request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Collect this page's ids, first-seen order, per-page dedup.
            ids_in_page = []
            for match in re.finditer(self._VIDEO_INDICATOR, page):
                candidate = match.group(1)
                if candidate not in ids_in_page:
                    ids_in_page.append(candidate)
            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.
            if len(ids_in_page) < self._GDATA_PAGE_SIZE:
                break
            page_index += 1

        all_ids_count = len(video_ids)

        # Honor --playlist-start / --playlist-end; -1 means "to the end".
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)
        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
                (username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    # Ajax episode list returns at most this many entries per page.
    # NOTE(review): value taken from the "limited (currently to 12 videos)"
    # comment below -- it was referenced as self._PAGE_SIZE but not visible
    # in this class; confirm against the live endpoint.
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
                (self.IE_NAME, username, pagenum))

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        # Resolve the numeric users_id from the profile page.
        request = compat_urllib_request.Request(url)
        try:
            page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            mobj = re.search(r'data-users-id="([^"]+)"', page)
            page_base = page_base % mobj.group(1)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.
        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(username, pagenum)
            url = page_base + "&page=" + str(pagenum)
            request = compat_urllib_request.Request(url)
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                # consistency fix: was str(err); every other handler in this
                # file uses compat_str(err)
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers.
            ids_in_page = []
            for page_mobj in re.finditer(r'href="/([^"]+)"', page):
                vid = unescapeHTML(page_mobj.group(1))
                # BUG FIX: the raw group used to be compared against the
                # already-unescaped stored entries, so ids containing HTML
                # entities were never deduplicated. Unescape first, then test.
                if vid not in ids_in_page:
                    ids_in_page.append(vid)
            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.
            if len(ids_in_page) < self._PAGE_SIZE:
                break
            pagenum += 1

        all_ids_count = len(video_ids)

        # Honor --playlist-start / --playlist-end; -1 means "to the end".
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)
        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
                (self.IE_NAME, username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download([u'http://blip.tv/' + video_id])
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    # group(1) is the file id; the (?#locale) comment marks the optional
    # two-letter locale path segment.
    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

    def _real_extract(self, url):
        # The file id is the last path segment.
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        # POSTing gateway_result=1 simulates pressing the button.
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        # NOTE(review): the surrounding `try:` line is missing from this
        # excerpt — the `except` below belongs to it.
        self.report_download_webpage(file_id)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % compat_str(err))

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                # Collapse runs of whitespace in the site's error message.
                restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
                self._downloader.trouble(u'ERROR: %s' % restriction_message)
            self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)

        file_url = mobj.group(1)
        # Drop the leading dot from the extension.
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        # NOTE(review): an `if mobj is None:` guard appears to be missing here.
        self._downloader.trouble(u'ERROR: unable to extract title')
        file_title = mobj.group(1).decode('utf-8')

        # NOTE(review): the `return [{...}]` wrapper around this info dict is
        # missing from this excerpt.
        'id': file_id.decode('utf-8'),
        'url': file_url.decode('utf-8'),
        'upload_date': None,
        'title': file_title,
        'ext': file_extension.decode('utf-8'),
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    # group('ID') is the numeric video id taken from the v= query parameter.
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    # Mobile login endpoint used by _real_initialize.
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[%s] Logging in' % self.IE_NAME)

    def _real_initialize(self):
        """Log in to Facebook, using --username/--password or .netrc data.

        Login failures are reported as warnings only; extraction proceeds
        unauthenticated.
        """
        if self._downloader is None:

        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            # NOTE(review): the `try:` line for the `except` below is missing
            # from this excerpt.
            info = netrc.netrc().authenticators(self._NETRC_MACHINE)
            if info is not None:
                raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
        except (IOError, netrc.NetrcParseError) as err:
            self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))

        if useremail is None:

        # Log in — the login_form dict construction is elided in this excerpt.
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        login_results = compat_urllib_request.urlopen(request).read()
        # A login <form> still present in the response means the login failed.
        if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
            self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): the `if mobj is None:` guard appears to be missing here.
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group('ID')

        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The JSON video parameters sit between these two literal JS snippets.
        BEFORE = '[["allowFullScreen","true"],["allowScriptAccess","always"],["salign","tl"],["scale","noscale"],["wmode","opaque"]].forEach(function(param) {swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        # 'params' is URL-encoded JSON with the stream URLs.
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        # Prefer HD, fall back to SD.
        video_url = params['hd_src']
        video_url = params['sd_src']
        raise ExtractorError(u'Cannot find video URL')
        video_duration = int(params['video_duration'])

        m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
        raise ExtractorError(u'Cannot find title in webpage')
        video_title = unescapeHTML(m.group(1))

        # NOTE(review): the `info = {...}` / `return [info]` wrapper around
        # these entries is missing from this excerpt.
        'title': video_title,
        'duration': video_duration,
        'thumbnail': params['thumbnail_src'],
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv"""

    # group(1) is the path after the host; used as the display id.
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    # Extracts the filename extension from a media URL.
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): the `if mobj is None:` guard appears to be missing here.
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # Ask blip.tv for a JSON description of the video (cchar is ? or &
        # depending on whether the URL already has a query string; its
        # computation is elided in this excerpt).
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        # blip.tv serves different (JSON) content to the iTunes user agent.
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        # NOTE(review): the surrounding `try:` line is missing from this
        # excerpt — the `except` below belongs to it.
        urlh = compat_urllib_request.urlopen(request)
        if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
            # The response *is* the media file; synthesize minimal metadata
            # from the URL instead of parsing JSON.
            basename = url.split('/')[-1]
            title,ext = os.path.splitext(basename)
            title = title.decode('UTF-8')
            ext = ext.replace('.', '')
            self.report_direct_download(title)
            'upload_date': None,
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
            json_code_bytes = urlh.read()
            json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % compat_str(err))

            json_data = json.loads(json_code)
            # The payload is either wrapped in a 'Post' key or is the data
            # object itself.
            if 'Post' in json_data:
                data = json_data['Post']

            # blip.tv timestamps look like '03-21-12 05:57AM'; normalize to
            # the YYYYMMDD convention used by upload_date.
            upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
            video_url = data['media']['url']
            umobj = re.match(self._URL_EXT, video_url)
            raise ValueError('Can not determine filename extension')
            ext = umobj.group(1)

            # NOTE(review): the `info = {...}` wrapper around these entries is
            # missing from this excerpt.
            'id': data['item_id'],
            'uploader': data['display_name'],
            'upload_date': upload_date,
            'title': data['title'],
            'format': data['media']['mimeType'],
            'thumbnail': data['thumbnailUrl'],
            'description': data['description'],
            'player_url': data['embedUrl'],
            # The downloader must keep using the iTunes UA to fetch the media.
            'user_agent': 'iTunes/10.6.1',
            except (ValueError,KeyError) as err:
                self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    # group(1) is the numeric video id, group(2) the URL slug.
    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the flv URL and title; return a one-element info-dict list."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # BUGFIX: was `self._download.trouble(...)` — the attribute is
            # `_downloader`, so an invalid URL raised AttributeError instead
            # of reporting the error like every other extractor does.
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)
        # The thumbnail link embeds the movie's base URL; the flv sits next
        # to it, named after the video id.
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\' />',
                 webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id':          video_id,
            'url':         video_url,
            'uploader':    None,
            'upload_date': None,
            'title':       video_title,
            'ext':         u'flv',
        }]
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    # NOTE(review): the closing `"""` of this verbose regex is missing from
    # this excerpt.
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                      |(https?://)?(www\.)?
                          (?P<showname>thedailyshow|colbertnation)\.com/
                         (full-episodes/(?P<episode>.*)|
                           (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                           |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))

    # Known bitrates, lowest quality first; turls below is sorted the same way.
    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    # NOTE(review): the bodies of these two mapping literals are missing from
    # this excerpt (bitrate -> extension and bitrate -> dimensions).
    _video_extensions = {
    _video_dimensions = {

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL is a verbose (re.VERBOSE) pattern.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def report_extraction(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

    def report_config_download(self, episode_id, media_id):
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration for %s' % (episode_id, media_id))

    def report_index_download(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

    def _print_formats(self, formats):
        """Print one line per available format id with extension and size."""
        print('Available formats:')
        print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        # NOTE(review): the `if mobj is None:` guard appears to be missing here.
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        if mobj.group('shortname'):
            # Expand :tds / :colbert style abbreviations to the full-episodes
            # landing page, then re-match so the named groups are populated.
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            # Clip URLs carry the episode title in show-specific groups.
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            epTitle = mobj.group('cntitle')
            dlNewest = not mobj.group('episode')
            epTitle = mobj.group('showname')
            epTitle = mobj.group('episode')

        req = compat_urllib_request.Request(url)
        self.report_extraction(epTitle)
        # NOTE(review): the surrounding `try:` line is missing from this
        # excerpt — the `except` below belongs to it.
        htmlHandle = compat_urllib_request.urlopen(req)
        html = htmlHandle.read()
        webpage = html.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))

        # The site may redirect; re-match against the final URL.
        url = htmlHandle.geturl()
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
        if mobj.group('episode') == '':
            self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
        epTitle = mobj.group('episode')

        # mtvnservices URIs embedded in the page identify the media feed.
        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.
            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
            mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        uri = mMovieParams[0][1]
        # The MRSS index lists one <item> per part of the episode.
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        indexXml = compat_urllib_request.urlopen(indexUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download episode index: ' + compat_str(err))

        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for partNum,itemEl in enumerate(itemEls):
            # guid looks like mgid:cms:video:<show>.com:<id>.
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            # Per-part config XML lists the available renditions.
            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configReq = compat_urllib_request.Request(configUrl)
            self.report_config_download(epTitle, shortMediaId)
            configXml = compat_urllib_request.urlopen(configReq).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))

            cdoc = xml.etree.ElementTree.fromstring(configXml)
            for rendition in cdoc.findall('.//rendition'):
                # (bitrate, rtmp url) pairs; collected into turls.
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)

            self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])

            # For now, just pick the highest bitrate
            format,rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            format, rtmp_video_url = f, v

            # The RTMP URLs cannot be downloaded directly; rewrite the
            # gsp.comedystor path onto a known HTTP mirror.
            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
            # NOTE(review): the `info = {...}` wrapper around these entries is
            # missing from this excerpt.
            'upload_date': officialDate,
            'description': officialTitle,
            results.append(info)
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    # group('showname') and group('episode') identify the video.
    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_extraction(self, showName):
        """Report information extraction."""
        self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)

    def report_config_download(self, showName):
        """Report configuration download."""
        self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): the `if mobj is None:` guard appears to be missing here.
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        # NOTE(review): the surrounding `try:` line is missing from this
        # excerpt — the `except` below belongs to it.
        webPage = compat_urllib_request.urlopen(url)
        webPageBytes = webPage.read()
        # Honor the charset declared in the Content-Type header, default utf-8.
        m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
        webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: ' + compat_str(err))

        # Scrape metadata from the OpenGraph / meta tags.
        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
        description = unescapeHTML(descMatch.group(1))
        imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
        imgUrl = unescapeHTML(imgMatch.group(1))
        playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
        playerUrl = unescapeHTML(playerUrlMatch.group(1))
        # The player URL carries the config URL in its `config` parameter.
        configUrlMatch = re.search('config=(.*)$', playerUrl)
        configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))

        self.report_config_download(showName)
        configJSON = compat_urllib_request.urlopen(configUrl)
        m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
        configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download configuration: ' + compat_str(err))

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

        config = json.loads(configJSON)
        except (ValueError,) as err:
            self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + compat_str(err))

        playlist = config['playlist']
        # Entry 0 is an ad/intro; entry 1 is the actual video.
        videoUrl = playlist[1]['url']

        # NOTE(review): the `info = {...}` wrapper around these entries is
        # missing from this excerpt.
        'uploader': showName,
        'upload_date': None,
        'thumbnail': imgUrl,
        'description': description,
        'player_url': playerUrl,
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com"""

    # group('videoid') is the numeric id; group('shorttitle') the slug.
    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): the `if mobj is None:` guard appears to be missing here.
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        # NOTE(review): the start of the `info = {...}` dict is missing from
        # this excerpt; 'description'/'title'/etc. are filled in below.
        'upload_date': None,

        self.report_extraction(video_id)
        # First request: metadata XML describing the video.
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        videoNode = mdoc.findall('./video')[0]
        info['description'] = videoNode.findall('./description')[0].text
        info['title'] = videoNode.findall('./caption')[0].text
        info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
        manifest_url = videoNode.findall('./file')[0].text
        self._downloader.trouble(u'\nERROR: Invalid metadata XML file')

        # Second request: the f4m (Adobe HDS) manifest.
        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))

        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        # The f4m namespace is explicit because ElementTree does not apply a
        # default namespace to findall paths.
        media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
        node_id = media_node.attrib['url']
        video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError as err:
            self._downloader.trouble(u'\nERROR: Invalid manifest file')

        # Build the direct fragment URL from the manifest location.
        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    # group(1) is the numeric video id.
    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): the `if mobj is None:` guard appears to be missing here.
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group(1)

        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        # Extract video URL (percent-encoded in the flv_url page variable).
        mobj = re.search(r'flv_url=(.+?)&', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video url')
        video_url = compat_urllib_parse.unquote(mobj.group(1))

        # Extract title from the page <title>, stripping the site suffix.
        mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = mobj.group(1)

        # Extract video thumbnail
        mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
        # group(0) is the full thumbnail URL, not just the filename group.
        video_thumbnail = mobj.group(0)

        # NOTE(review): the `return [{...}]` wrapper around these entries is
        # missing from this excerpt.
        'upload_date': None,
        'title': video_title,
        'thumbnail': video_thumbnail,
        'description': None,
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
    """

    # group(1) is the uploader slug, group(2) the track slug.
    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): the `if mobj is None:` guard appears to be missing here.
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract simple title (uploader + slug of song title)
        slug_title = mobj.group(2)
        simple_title = uploader + u'-' + slug_title

        self.report_resolve('%s/%s' % (uploader, slug_title))

        # Resolve the page URL to track metadata via the public API.
        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(resolv_url)
        # NOTE(review): the surrounding `try:` line is missing from this
        # excerpt — the `except` below belongs to it.
        info_json_bytes = compat_urllib_request.urlopen(request).read()
        info_json = info_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction('%s/%s' % (uploader, slug_title))

        # Fetch the per-track stream definitions (contains the mp3 URL).
        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(streams_url)
        stream_json_bytes = compat_urllib_request.urlopen(request).read()
        stream_json = stream_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download stream definitions: %s' % compat_str(err))

        streams = json.loads(stream_json)
        mediaURL = streams['http_mp3_128_url']

        # NOTE(review): the `return [{...}]` wrapper around these entries is
        # missing from this excerpt.
        'uploader': info['user']['username'],
        'upload_date': info['created_at'],
        'title': info['title'],
        'description': info['description'],
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): the `if mobj is None:` guard appears to be missing here.
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        # No separate id in the URL; use the URL itself as the display id.
        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # Extract video URL: the jsclassref attribute holds the base64- and
        # percent-encoded RTMP path.
        mobj = re.search(r"jsclassref='([^']*)'", webpage)
        self._downloader.trouble(u'ERROR: unable to extract video url')
        real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        # Extract title from the page's JS contentTitle variable.
        mobj = re.search(r'contentTitle = "(.*?)";', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = mobj.group(1)

        # Extract description
        video_description = u'No description available.'
        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if mobj is not None:
            video_description = mobj.group(1)

        # Derive the video id and extension from the media filename.
        video_filename = video_url.split('/')[-1]
        video_id, extension = video_filename.split('.')

        # NOTE(review): the `info = {...}` wrapper around these entries is
        # missing from this excerpt.
        'upload_date': None,
        'title': video_title,
        'ext': extension, # Extension is always(?) mp4, but seems to be flv
        'description': video_description,
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    # Marked broken: disables this IE and skips its tests.
    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    # group(1) is the uploader slug, group(2) the cloudcast slug.
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        InfoExtractor.__init__(self, downloader)

    def report_download_json(self, file_id):
        """Report JSON download."""
        self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json"""
        # NOTE(review): the `try:` line for the `except TypeError` below is
        # missing from this excerpt.
        bitrate_list = jsonData[fmt]
        if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
            bitrate = max(bitrate_list) # select highest

        url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        for url in url_list:
            # Probe each candidate with a plain GET; first success wins.
            compat_urllib_request.urlopen(url)
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:

    def _print_formats(self, formats):
        """Print one line per (format, bitrate) pair with its extension."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                ext = formats[fmt][b][0]
                print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): the `if mobj is None:` guard appears to be missing here.
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        # extract uploader & filename from url
        # (.decode on str works on Python 2 only — this IE predates py3.)
        uploader = mobj.group(1).decode('utf-8')
        file_id = uploader + "-" + mobj.group(2).decode('utf-8')

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        self.report_download_json(file_url)
        jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err))

        # parse JSON
        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)

        if req_format is None or req_format == 'best':
            # Try each format until one yields a live URL.
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
            if req_format not in formats:
                self._downloader.trouble(u'ERROR: format is not available')

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        # NOTE(review): the `return [{...}]` wrapper around these entries is
        # missing from this excerpt.
        'id': file_id.decode('utf-8'),
        'url': file_url.decode('utf-8'),
        'uploader': uploader.decode('utf-8'),
        'upload_date': None,
        'title': json_data['name'],
        'ext': file_url.split('.')[-1].decode('utf-8'),
        'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
        'thumbnail': json_data['thumbnail_url'],
        'description': json_data['description'],
        'player_url': player_url.decode('utf-8'),
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""

    # NOTE(review): this copy of the block has lost several lines in transit
    # ("if mobj is None:" guards, "info = {" / "}" dict delimiters, "try:"
    # openers and "return" statements).  Inline notes below flag the gaps;
    # verify against the upstream project before relying on this code.

    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def report_download_webpage(self, objid):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        # Dispatches on URL shape: a single video, a course page, or the
        # site root (which recursively extracts every course).
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): "if mobj is None:" guard elided before this raise
        raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            # NOTE(review): enclosing "info = {" literal elided around these keys
                'id': course + '_' + video,
                'upload_date': None,

            self.report_extraction(info['id'])
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            # NOTE(review): "try:" opener elided before the urlopen call
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            # NOTE(review): "try:" opener elided; the findall()[0] lookups can
            # raise IndexError when the XML lacks <title>/<videoFile>
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
                self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
            info['ext'] = info['url'].rpartition('.')[2]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            # NOTE(review): "info = {" literal elided around this key
                'upload_date': None,

            coursepage = self._download_webpage(url, info['id'],
                            note='Downloading course info page',
                            errnote='Unable to download course info page')

            m = re.search('<h1>([^<]+)</h1>', coursepage)
            # NOTE(review): "if m:" / "else:" guards elided around the next two lines
                info['title'] = unescapeHTML(m.group(1))
                info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
                info['description'] = unescapeHTML(m.group(1))

            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            # NOTE(review): the list-comprehension wrapper building info['list']
            # from vpage is elided around these keys
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),

            # Recursively extract each referenced video page.
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            # NOTE(review): "else:" root-page branch header elided below
            # NOTE(review): "info = {" literal elided around these keys
                'id': 'Stanford OpenClassroom',
                'upload_date': None,

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            # NOTE(review): "try:" opener elided
                rootpage = compat_urllib_request.urlopen(rootURL).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))

            info['title'] = info['id']

            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            # NOTE(review): list-comprehension wrapper elided around these keys
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),

            # Recursively extract each referenced course page.
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    # NOTE(review): this copy is missing the "if mobj is None:"/"return"
    # guards that normally precede the trouble() calls, the "try:" opener
    # before urlopen, and the "info = {" / "return [info]" framing around the
    # final dict entries.  Verify against upstream.

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        if not mobj.group('proto'):
            url = 'http://' + url   # scheme is optional in _VALID_URL
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        # Song name and performer come from <meta> tags on the page.
        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract song name')
        # NOTE(review): .decode('iso-8859-1') assumes Python 2 byte strings
        song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract performer')
        performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        self._downloader.trouble(u'ERROR: unable to mtvn_uri')
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        self._downloader.trouble(u'ERROR: unable to extract content id')
        content_id = mobj.group(1)

        # The mediaGen endpoint returns XML listing available renditions.
        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        # NOTE(review): "try:" opener elided
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        # NOTE(review): "try:" opener elided before the attribute accesses
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
            self._downloader.trouble('Invalid rendition field.')

        # NOTE(review): "info = {" literal elided around these keys
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
class YoukuIE(InfoExtractor):
    # Information extractor for v.youku.com.
    #
    # NOTE(review): lines missing from this copy include the
    # "def _gen_sid(self):" header, list initialisations ("mixed = []",
    # "realId = []", "files_info = []"), loop headers, guards/returns and the
    # "info = {" framing near the end.  Verify against upstream.

    _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, file_id))

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

        # NOTE(review): the "def _gen_sid(self):" line is missing here; this
        # body builds a pseudo-unique session id from the current time in
        # milliseconds plus two random numbers.
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        # Deterministically shuffles the character source with a linear
        # congruential generator driven by the server-provided seed.
        # NOTE(review): "mixed = []" initialisation and the final return are
        # elided in this copy.
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        for i in range(len(source)):
            seed = (seed * 211 + 30031 ) % 65536
            index = math.floor(seed / 65536 * len(source) )
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        #return ''.join(mixed)

    def _get_file_id(self, fileId, seed):
        # Decodes the obfuscated file id: each '*'-separated number indexes
        # into the mixed alphabet produced above.
        # NOTE(review): "realId = []" and the "for ch in ids:" header are
        # elided in this copy.
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
            realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        request = compat_urllib_request.Request(info_url, None, std_headers)
        # NOTE(review): "try:" opener elided
            self.report_download_webpage(video_id)
            jsondata = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

        self.report_extraction(video_id)
        # NOTE(review): "try:" opener elided for the whole JSON section below
            jsonstr = jsondata.decode('utf-8')
            config = json.loads(jsonstr)

            video_title = config['data'][0]['title']
            seed = config['data'][0]['seed']

            format = self._downloader.params.get('format', None)
            supported_format = list(config['data'][0]['streamfileids'].keys())

            # Default to the best available quality; 'worst' picks the lowest.
            # NOTE(review): the format-selection branch bodies are elided here.
            if format is None or format == 'best':
                if 'hd2' in supported_format:
            elif format == 'worst':

            fileid = config['data'][0]['streamfileids'][format]
            keys = [s['k'] for s in config['data'][0]['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            self._downloader.trouble(u'ERROR: unable to extract info section')

        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        #column 8,9 of fileid represent the segment number
        #fileid[7:9] should be changed
        for index, key in enumerate(keys):
            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            # NOTE(review): "info = {" literal elided around these keys
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'upload_date': None,
                'title': video_title,

            files_info.append(info)
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    # NOTE(review): "if mobj/result is None:" guards, "return" lines, the
    # "try:" opener and the closing of the final info dict are missing from
    # this copy.  Verify against upstream.

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    VIDEO_URL_RE = r'flv_url=(.*?)&'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'

    def report_webpage(self, video_id):
        """Report information extraction"""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction"""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group(1)

        self.report_webpage(video_id)

        # Get webpage content
        # NOTE(review): "try:" opener elided
            webpage_bytes = compat_urllib_request.urlopen(url).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)

        # The flv URL is percent-encoded inside the page's flashvars.
        result = re.search(self.VIDEO_URL_RE, webpage)
        self._downloader.trouble(u'ERROR: unable to extract video url')
        video_url = compat_urllib_parse.unquote(result.group(1))

        result = re.search(self.VIDEO_TITLE_RE, webpage)
        self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = result.group(1)

        result = re.search(self.VIDEO_THUMB_RE, webpage)
        self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
        video_thumbnail = result.group(1)

        # NOTE(review): "return [{" framing elided around these keys
            'upload_date': None,
            'title': video_title,
            'thumbnail': video_thumbnail,
            'description': None,
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    # NOTE(review): guard lines ("if mobj is None:" + return), "try:" openers
    # and the final "return [{...}]" framing are missing from this copy.

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extract_entry(self, url):
        """Report downloading extry"""
        self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)

    def report_date(self, upload_date):
        """Report downloading extry"""
        self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)

    def report_uploader(self, uploader):
        """Report downloading extry"""
        self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)

    def report_title(self, video_title):
        """Report downloading extry"""
        self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)

    def report_extract_vid_page(self, video_page):
        """Report information extraction."""
        self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)

    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        post_url = mobj.group(0)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        self.report_extract_entry(post_url)
        request = compat_urllib_request.Request(post_url)
        # NOTE(review): "try:" opener elided
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))

        # Extract update date
        # NOTE(review): the "upload_date = None" default and "if mobj:" guard
        # appear to be elided around this section
        pattern = 'title="Timestamp">(.*?)</a>'
        mobj = re.search(pattern, webpage)
            upload_date = mobj.group(1)
            # Convert timestring to a format suitable for filename
            upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
            upload_date = upload_date.strftime('%Y%m%d')
        self.report_date(upload_date)

        # Extract uploader (same elided-guard caveat as above)
        pattern = r'rel\="author".*?>(.*?)</a>'
        mobj = re.search(pattern, webpage)
            uploader = mobj.group(1)
        self.report_uploader(uploader)

        # Get the first line for title
        pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
        mobj = re.search(pattern, webpage)
            video_title = mobj.group(1)
        self.report_title(video_title)

        # Step 2, Stimulate clicking the image box to launch video
        pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
        mobj = re.search(pattern, webpage)
            self._downloader.trouble(u'ERROR: unable to extract video page URL')

        video_page = mobj.group(1)
        request = compat_urllib_request.Request(video_page)
        # NOTE(review): "try:" opener elided
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
        self.report_extract_vid_page(video_page)

        # Extract video links on video page
        """Extract video links of all sizes"""
        pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        mobj = re.findall(pattern, webpage)
            self._downloader.trouble(u'ERROR: unable to extract video links')

        # Sort in resolution
        links = sorted(mobj)

        # Choose the lowest of the sort, i.e. highest resolution
        video_url = links[-1]
        # Only get the url. The resolution part in the tuple has no use anymore
        video_url = video_url[-1]
        # Treat escaped \u0026 style hex
        # NOTE(review): "try:" opener elided
            video_url = video_url.decode("unicode_escape")
        except AttributeError: # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')

        # NOTE(review): "return [{" framing elided around these keys
            'uploader': uploader,
            'upload_date': upload_date,
            'title': video_title,
            'ext': video_extension,
class NBAIE(InfoExtractor):
    """Information extractor for nba.com video pages.

    The media URL is not scraped: it is derived from the path component of
    the page URL, which maps directly onto Turner's CDN layout.
    """
    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
    IE_NAME = u'nba'

    def _real_extract(self, url):
        """Return a single-element info list for the video at *url*."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)
        if video_id.endswith('/index.html'):
            video_id = video_id[:-len('/index.html')]

        webpage = self._download_webpage(url, video_id)

        # The CDN serves a 720p MP4 at a path derived from the page path.
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'

        def _findProp(rexp, default=None):
            # Scrape a single optional property from the page HTML.
            m = re.search(rexp, webpage)
            if m:
                return unescapeHTML(m.group(1))
            else:
                return default

        shortened_video_id = video_id.rpartition('/')[2]
        title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
        info = {
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            # BUG FIX: the key was misspelled 'uploader_date'; the field the
            # downloader consumes (see class docs at top of file) is
            # 'upload_date'.
            'upload_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
            'description': _findProp(r'<div class="description">(.*?)</h1>'),
        }
        return [info]
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    # NOTE(review): guard/return lines, list initialisations ("info = []"),
    # the "try:" opener, "else:" branches and dict framing are missing from
    # this copy.  Verify against upstream.

    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
                (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url):
        # NOTE(review): "try:" opener elided
            urlh = compat_urllib_request.urlopen(url)
            webpage_bytes = urlh.read()
            webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info JSON: %s' % compat_str(err))

        response = json.loads(webpage)
        # The API returns a list of clips on success, a dict on error.
        if type(response) != list:
            error_text = response.get('error', 'unknown error')
            self._downloader.trouble(u'ERROR: Justin.tv API: %s' % error_text)
        for clip in response:
            video_url = clip['video_file_url']
            # NOTE(review): "if video_url:" guard and the "info.append({"
            # framing are elided around this section
                video_extension = os.path.splitext(video_url)[1][1:]
                video_date = re.sub('-', '', clip['start_time'][:10])
                video_uploader_id = clip.get('user_id', clip.get('channel_id'))
                video_id = clip['id']
                video_title = clip.get('title', video_id)
                'title': video_title,
                'uploader': clip.get('channel_name', video_uploader_id),
                'uploader_id': video_uploader_id,
                'upload_date': video_date,
                'ext': video_extension,
        return (len(response), info)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        api = 'http://api.justin.tv'
        video_id = mobj.group(mobj.lastindex)
        # lastindex == 1 means a whole channel; otherwise a single archive.
        if mobj.lastindex == 1:
            api += '/channel/archives/%s.json'
            api += '/broadcast/by_archive/%s.json'
        api = api % (video_id,)

        self.report_extraction(video_id)

        # NOTE(review): "info = []", "offset = 0" and the paging loop header
        # are elided around this section
        limit = self._JUSTIN_PAGE_LIMIT
            self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url)
            info.extend(page_info)
            # A short page means we've reached the final page.
            if not paged or page_count != limit:
class FunnyOrDieIE(InfoExtractor):
    # Information extractor for funnyordie.com.
    #
    # NOTE(review): "if mobj is None:"/"if not m:" guards, the else-branch
    # that defaults the description, and the "info = {" / "return [info]"
    # framing are missing from this copy.  Verify against upstream.

    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        # The second <source> tag carries the actual media URL.
        m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
        self._downloader.trouble(u'ERROR: unable to find video information')
        video_url = unescapeHTML(m.group('url'))

        m = re.search(r"class='player_page_h1'>\s+<a.*?>(?P<title>.*?)</a>", webpage)
        self._downloader.trouble(u'Cannot find video title')
        title = unescapeHTML(m.group('title'))

        # Description is optional (og:description meta tag).
        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        desc = unescapeHTML(m.group('desc'))

        # NOTE(review): "info = {" literal elided around this key
            'description': desc,
class TweetReelIE(InfoExtractor):
    # Information extractor for tweetreel.com.
    #
    # NOTE(review): "if mobj/m is None:" guards and the "info = {" /
    # "return [info]" framing are missing from this copy.

    _VALID_URL = r'^(?:https?://)?(?:www\.)?tweetreel\.com/[?](?P<id>[0-9a-z]+)$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        m = re.search(r'<div id="left" status_id="([0-9]+)">', webpage)
        self._downloader.trouble(u'ERROR: Cannot find status ID')
        status_id = m.group(1)

        m = re.search(r'<div class="tweet_text">(.*?)</div>', webpage, flags=re.DOTALL)
        self._downloader.trouble(u'WARNING: Cannot find description')
        # Strip embedded <a> tags from the tweet text before unescaping.
        desc = unescapeHTML(re.sub('<a.*?</a>', '', m.group(1))).strip()

        m = re.search(r'<div class="tweet_info">.*?from <a target="_blank" href="https?://twitter.com/(?P<uploader_id>.+?)">(?P<uploader>.+?)</a>', webpage, flags=re.DOTALL)
        self._downloader.trouble(u'ERROR: Cannot find uploader')
        uploader = unescapeHTML(m.group('uploader'))
        uploader_id = unescapeHTML(m.group('uploader_id'))

        m = re.search(r'<span unixtime="([0-9]+)"', webpage)
        self._downloader.trouble(u'ERROR: Cannot find upload date')
        # Unix timestamp -> YYYYMMDD as expected by the downloader.
        upload_date = datetime.datetime.fromtimestamp(int(m.group(1))).strftime('%Y%m%d')

        # The media file lives at a predictable path keyed on the status id.
        video_url = 'http://files.tweetreel.com/video/' + status_id + '.mov'

        # NOTE(review): "info = {" literal elided around these keys
            'description': desc,
            'uploader': uploader,
            'uploader_id': uploader_id,
            'internal_id': status_id,
            'upload_date': upload_date
class SteamIE(InfoExtractor):
    # Information extractor for store.steampowered.com video pages.
    #
    # NOTE(review): the "(?P<gameID>...)" component of _VALID_URL (required by
    # m.group('gameID') below), the "videos = []" initialisation, the
    # per-video dict framing and the final "return videos" are missing from
    # this copy.  Verify against upstream.

    _VALID_URL = r"""http://store.steampowered.com/
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL is written with re.VERBOSE.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        gameID = m.group('gameID')
        videourl = 'http://store.steampowered.com/video/%s/' % gameID
        webpage = self._download_webpage(videourl, gameID)
        # Movies, titles and thumbnails appear in the same order on the page,
        # so the three iterators are zipped together.
        mweb = re.finditer(urlRE, webpage)
        namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
        titles = re.finditer(namesRE, webpage)
        thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
        thumbs = re.finditer(thumbsRE, webpage)
        for vid,vtitle,thumb in zip(mweb,titles,thumbs):
            video_id = vid.group('videoID')
            title = vtitle.group('videoName')
            video_url = vid.group('videoURL')
            video_thumb = thumb.group('thumbnail')
            # NOTE(review): "if not video_url:" guard elided before this call
                self._downloader.trouble(u'ERROR: Cannot find video url for %s' % video_id)
            # NOTE(review): per-video dict framing elided around these keys
                'title': unescapeHTML(title),
                'thumbnail': video_thumb
class UstreamIE(InfoExtractor):
    """Information extractor for ustream.tv recorded videos."""
    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        """Return a single-element info list for the recorded video at *url*.

        Restores the result framing (the info dict and return statement)
        that was lost from this block.
        """
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')
        # Recorded videos are served from a predictable CDN path.
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)
        # Title and uploader id are embedded as data-* attributes in the page.
        m = re.search(r'data-title="(?P<title>.+)"',webpage)
        title = m.group('title')
        m = re.search(r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',webpage)
        uploader = m.group('uploader')
        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'flv',
            'title': title,
            'uploader': uploader,
        }
        return [info]
class RBMARadioIE(InfoExtractor):
    """Information extractor for rbmaradio.com shows."""
    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        """Extract the audio stream URL and show metadata.

        Restores the control-flow lines (guards, try opener, info framing)
        that were lost from this block.
        """
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')

        webpage = self._download_webpage(url, video_id)
        # Show metadata is embedded as a JSON object in an inline script.
        m = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
        if not m:
            raise ExtractorError(u'Cannot find metadata')
        json_data = m.group(1)

        try:
            data = json.loads(json_data)
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        # Request the 256 kbit/s rendition from the Akamai stream URL.
        video_url = data['akamai_url'] + '&cbr=256'
        url_parts = compat_urllib_parse_urlparse(video_url)
        # Extension is taken from the URL path (text after the last '.').
        video_ext = url_parts.path.rpartition('.')[2]
        info = {
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': data['title'],
            'description': data.get('teaser_text'),
            'location': data.get('country_of_origin'),
            'uploader': data.get('host', {}).get('name'),
            'uploader_id': data.get('host', {}).get('slug'),
            'thumbnail': data.get('image', {}).get('large_url_2x'),
            'duration': data.get('duration'),
        }
        return [info]
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""

    # NOTE(review): guard lines ("if result is None:" + return/raise framing),
    # loop headers ("for x in formats:", "for link in links:"), the
    # size/bitrate unpacking, and the formats-list framing are missing from
    # this copy.  Verify against upstream.

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s' % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        # Linear scan for the entry whose 'format' matches req_format.
        # NOTE(review): "for x in formats:" header and "return x"/"return None"
        # are elided here.
            if(x["format"]==req_format):

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        video_id = mobj.group('videoid')

        # The age gate is bypassed with a cookie.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get the video title
        result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
        raise ExtractorError(u'Unable to extract video title')
        video_title = result.group('title').strip()

        # Get the video date
        result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
        self._downloader.to_stderr(u'WARNING: unable to extract video date')
        upload_date = result.group('date').strip()

        # Get the video uploader
        result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
        self._downloader.to_stderr(u'WARNING: unable to extract uploader')
        video_uploader = None
        video_uploader = result.group('uploader').strip()
        video_uploader = clean_html( video_uploader )

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        result = re.search(DOWNLOAD_LIST_RE, webpage)
        raise ExtractorError(u'Unable to extract download list')
        download_list_html = result.group('download_list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if(len(links) == 0):
            raise ExtractorError(u'ERROR: no known formats available for video')

        self._downloader.to_screen(u'[youporn] Links found: %d' % len(links))

        # NOTE(review): "formats = []" and "for link in links:" elided here
            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML( link )
            path = compat_urllib_parse_urlparse( video_url ).path
            extension = os.path.splitext( path )[1][1:]
            format = path.split('/')[4].split('_')[:2]
            # NOTE(review): "size, bitrate = format" style unpacking elided
            format = "-".join( format )
            title = u'%s-%s-%s' % (video_title, size, bitrate)

            # NOTE(review): "formats.append({" framing elided around these keys
                'uploader': video_uploader,
                'upload_date': upload_date,
                'description': None,

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)

        req_format = self._downloader.params.get('format', None)
        self._downloader.to_screen(u'[youporn] Format: %s' % req_format)

        # NOTE(review): the return statements for each branch are elided here
        if req_format is None or req_format == 'best':
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            format = self._specific( req_format, formats )
            self._downloader.trouble(u'ERROR: requested format not available')
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""

    # NOTE(review): "if mobj/result is None:" guards, "return" lines and part
    # of the final info dict are missing from this copy.

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        video_id = mobj.group('videoid')
        video_title = mobj.group('title')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # The flv URL sits inside the player's JS configuration.
        VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
        result = re.search(VIDEO_URL_RE, webpage)
        self._downloader.trouble(u'ERROR: unable to extract video url')
        video_url = compat_urllib_parse.unquote(result.group('url'))

        #Get the uploaded date
        VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
        result = re.search(VIDEO_UPLOADED_RE, webpage)
        # NOTE(review): this error message says "title" but the failed lookup
        # is the upload date -- likely a copy/paste mistake
        self._downloader.trouble(u'ERROR: unable to extract video title')
        upload_date = result.group('date')

        info = {'id': video_id,
                'upload_date': upload_date,
                'title': video_title,
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""

    # NOTE(review): "if mobj/result is None:" guards and part of the final
    # info dict are missing from this copy.

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        video_id = mobj.group('videoid')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video title
        result = re.search(r'<title>(?P<title>.*)</title>', webpage)
        raise ExtractorError(u'ERROR: unable to extract video title')
        video_title = result.group('title').strip()

        # Get the embed page
        result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        raise ExtractorError(u'ERROR: unable to extract embed page')

        embed_page_url = result.group(0).strip()
        video_id = result.group('videoid')

        # The real media URL is only present on the embed page.
        webpage = self._download_webpage(embed_page_url, video_id)

        result = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
        raise ExtractorError(u'ERROR: unable to extract video url')
        video_url = result.group('source')

        info = {'id': video_id,
                'title': video_title,
                'player_url': embed_page_url}
class EightTracksIE(InfoExtractor):
    # Information extractor for 8tracks.com mixes.
    #
    # NOTE(review): the IE_NAME constant, the "if mobj is None:" guard, the
    # "mix_id = data['id']" assignment (mix_id is otherwise undefined below),
    # and the res/info list framing are missing from this copy.

    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        # The page embeds the mix metadata as a JS object literal.
        m = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
        raise ExtractorError(u'Cannot find trax information')
        json_like = m.group(1)
        data = json.loads(json_like)

        session = str(random.randint(0, 1000000000))
        track_count = data['tracks_count']
        first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
        next_url = first_url

        # The API serves one track per request; iterate until at_last_track.
        for i in itertools.count():
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(i+1), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            # NOTE(review): "info = {" framing elided around these keys
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
            if api_data['set']['at_last_track']:
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
class KeekIE(InfoExtractor):
    """Information extractor for keek.com short videos."""
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        """Build CDN media/thumbnail URLs and scrape title and uploader.

        Restores the result framing (info dict and return statement) that
        was lost from this block.
        """
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')
        # Media and thumbnail live at predictable CDN locations keyed on id.
        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
        webpage = self._download_webpage(url, video_id)
        m = re.search(r'<meta property="og:title" content="(?P<title>.+)"', webpage)
        title = unescapeHTML(m.group('title'))
        m = re.search(r'<div class="bio-names-and-report">[\s\n]+<h4>(?P<uploader>\w+)</h4>', webpage)
        uploader = unescapeHTML(m.group('uploader'))
        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumbnail,
            'uploader': uploader,
        }
        return [info]
class TEDIE(InfoExtractor):
    # Information extractor for ted.com talks and playlists.
    #
    # NOTE(review): parts of the verbose _VALID_URL (the alternation between
    # the playlist and talk branches, and the trailing ".html"), the "else :"
    # branch header in _real_extract, the "video_RE=r'''" opener, list
    # initialisations ("info=[]") and the final info dict framing are missing
    # from this copy.  Verify against upstream.

    _VALID_URL=r'''http://www.ted.com/
        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
        ((?P<type_talk>talks)) # We have a simple talk
        /(?P<name>\w+) # Here goes the name and then ".html"
        '''

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL is written with re.VERBOSE.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        m=re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type_talk'):
            return [self._talk_info(url)]
        # NOTE(review): "else :" branch header elided before these lines
            playlist_id=m.group('playlist_id')
            name=m.group('name')
            self._downloader.to_screen(u'[%s] Getting info of playlist %s: "%s"' % (self.IE_NAME,playlist_id,name))
            return self._playlist_videos_info(url,name,playlist_id)

    def _talk_video_link(self,mediaSlug):
        '''Returns the video link for that mediaSlug'''
        return 'http://download.ted.com/talks/%s.mp4' % mediaSlug

    def _playlist_videos_info(self,url,name,playlist_id=0):
        '''Returns the videos of the playlist'''
        # NOTE(review): the "video_RE=r'''" opener for this verbose pattern is
        # elided in this copy
            <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
            ([.\s]*?)data-playlist_item_id="(\d+)"
            ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
        video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        # Videos and their display names appear in page order, so the two
        # iterators are zipped together.
        m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
        m_names=re.finditer(video_name_RE,webpage)
        for m_video, m_name in zip(m_videos,m_names):
            video_id=m_video.group('video_id')
            talk_url='http://www.ted.com%s' % m_name.group('talk_url')
            info.append(self._talk_info(talk_url,video_id))

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        m=re.match(self._VALID_URL, url,re.VERBOSE)
        videoName=m.group('name')
        webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
        # If the url includes the language we get the title translated
        title_RE=r'<h1><span id="altHeadline" >(?P<title>.*)</span></h1>'
        title=re.search(title_RE, webpage).group('title')
        info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
                        "id":(?P<videoID>[\d]+).*?
                        "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
        thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
        thumb_match=re.search(thumb_RE,webpage)
        info_match=re.search(info_RE,webpage,re.VERBOSE)
        video_id=info_match.group('videoID')
        mediaSlug=info_match.group('mediaSlug')
        video_url=self._talk_video_link(mediaSlug)
        # NOTE(review): "info = {" framing elided around this key
            'thumbnail': thumb_match.group('thumbnail')
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de.

    Derives the video id from the last URL path element and queries the
    site's XML metadata endpoint for the media URL and descriptive fields.
    """
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        # fromstring wants bytes; re-encode the downloaded text.
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata; each mandatory field aborts the
        # extraction (trouble + early return) when missing, so the code
        # below never dereferences a missing element.
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            self._downloader.trouble(u'ERROR: unable to extract download url')
            return
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        title = title_el.text
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            self._downloader.trouble(u'ERROR: unable to extract format id')
            return
        # Local renamed from 'format' to avoid shadowing the builtin; the
        # dictionary key below is unchanged.
        video_format = format_id_el.text
        # Optional fields fall back to None instead of aborting.
        description_el = metadata.find('description')
        description = description_el.text if description_el is not None else None
        imagePreview_el = metadata.find('imagePreview')
        thumbnail = imagePreview_el.text if imagePreview_el is not None else None
        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': video_format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
4114 def gen_extractors():
4115 """ Return a list of an instance of every supported extractor.
4116 The order does matter; the first extractor matched is the one handling the URL.
4119 YoutubePlaylistIE(),
4143 StanfordOpenClassroomIE(),