2 # -*- coding: utf-8 -*-
4 from __future__ import absolute_import
15 import xml.etree.ElementTree
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    title:          Video title, unescaped.
    ext:            Video filename extension.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The .srt file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    The fields should all be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """
71 def __init__(self, downloader=None):
72 """Constructor. Receives an optional downloader."""
74 self.set_downloader(downloader)
76 def suitable(self, url):
77 """Receives a URL and returns True if suitable for this IE."""
78 return re.match(self._VALID_URL, url) is not None
81 """Getter method for _WORKING."""
85 """Initializes an instance (authentication, etc)."""
87 self._real_initialize()
    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        # Delegates the actual work to the subclass hook.
        # NOTE(review): upstream versions call an initialize() step before
        # extracting; that line is not visible in this excerpt -- confirm
        # against the full file.
        return self._real_extract(url)
95 def set_downloader(self, downloader):
96 """Sets the downloader for this IE."""
97 self._downloader = downloader
99 def _real_initialize(self):
100 """Real initialization process. Redefine in subclasses."""
103 def _real_extract(self, url):
104 """Real extraction process. Redefine in subclasses."""
109 return type(self).__name__[:-2]
    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the response handle """
        # Default progress message when the caller did not supply one.
        # NOTE(review): the guard (presumably `if note is None:`) and the
        # `try:` opener for the handler below are elided in this excerpt --
        # confirm against the full file.
        note = u'Downloading video webpage'
        self._downloader.to_screen(u'[%s] %s: %s' % (self.IE_NAME, video_id, note))
        return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            # Default error message, then re-raise with the original traceback.
            errnote = u'Unable to download webpage'
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])
123 def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
124 """ Returns the data of the page as a string """
125 urlh = self._request_webpage(url_or_request, video_id, note, errnote)
126 webpage_bytes = urlh.read()
127 return webpage_bytes.decode('utf-8', 'replace')
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""
    # Handles watch pages, embeds, youtu.be short links and raw IDs
    # (see the _VALID_URL verbose regex below).
135 (?:https?://)? # http(s):// (optional)
136 (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
137 tube\.majestyc\.net/) # the various hostnames, with wildcard subdomains
138 (?:.*?\#/)? # handle anchor (#/) redirect urls
139 (?!view_play_list|my_playlists|artist|playlist) # ignore playlist URLs
140 (?: # the various things that can precede the ID:
141 (?:(?:v|embed|e)/) # v/ or embed/ or e/
142 |(?: # or the v= param in all its forms
143 (?:watch(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
144 (?:\?|\#!?) # the params delimiter ? or # or #!
145 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
148 )? # optional -> youtube.com/xxxx is OK
149 )? # all until now is optional -> you can pass the naked ID
150 ([0-9A-Za-z_-]+) # here is it! the YouTube video ID
151 (?(1).+)? # if we found the ID, everything can follow
    # Forces the English-language UI (and US geo settings) so that the
    # text-matching regexes used later see predictable page content.
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    # Google account login endpoint used during initialization.
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    # Age-gate confirmation endpoint.
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    # Captures the original target from redirect-style URLs (?next_url=...).
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    # Key looked up in ~/.netrc for stored credentials.
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    # Same itags, reordered so that free formats (presumably WebM) are preferred.
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
161 _video_extensions = {
167 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
173 _video_dimensions = {
191 def suitable(self, url):
192 """Receives a URL and returns True if suitable for this IE."""
193 return re.match(self._VALID_URL, url, re.VERBOSE) is not None
195 def report_lang(self):
196 """Report attempt to set language."""
197 self._downloader.to_screen(u'[youtube] Setting language')
199 def report_login(self):
200 """Report attempt to log in."""
201 self._downloader.to_screen(u'[youtube] Logging in')
203 def report_age_confirmation(self):
204 """Report attempt to confirm age."""
205 self._downloader.to_screen(u'[youtube] Confirming age')
207 def report_video_webpage_download(self, video_id):
208 """Report attempt to download video webpage."""
209 self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)
211 def report_video_info_webpage_download(self, video_id):
212 """Report attempt to download video info webpage."""
213 self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)
215 def report_video_subtitles_download(self, video_id):
216 """Report attempt to download video info webpage."""
217 self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)
219 def report_information_extraction(self, video_id):
220 """Report attempt to extract video information."""
221 self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)
223 def report_unavailable_format(self, video_id, format):
224 """Report extracted video URL."""
225 self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
227 def report_rtmp_download(self):
228 """Indicate the download will use the RTMP protocol."""
229 self._downloader.to_screen(u'[youtube] RTMP download detected')
    def _closed_captions_xml_to_srt(self, xml_string):
        """Convert YouTube's timedtext XML into .srt-formatted text."""
        # Grab (start, dur_tag, dur, caption) tuples straight out of the XML.
        texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
        # TODO parse xml instead of regex
        for n, (start, dur_tag, dur, caption) in enumerate(texts):
            if not dur: dur = '4'  # default caption length, in seconds
            # NOTE(review): the lines initializing the `srt` accumulator,
            # converting `start` to float, and returning the result are
            # elided in this excerpt -- confirm against the full file.
            end = start + float(dur)
            # Format both timestamps as HH:MM:SS,mmm as required by .srt.
            start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
            end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
            caption = unescapeHTML(caption)
            caption = unescapeHTML(caption) # double cycle, intentional
            srt += str(n+1) + '\n'
            srt += start + ' --> ' + end + '\n'
            srt += caption + '\n\n'
    def _extract_subtitles(self, video_id):
        """Fetch closed captions for *video_id*; returns (warning_or_None, srt_or_None)."""
        # NOTE(review): several lines (try: openers, else branches) are elided
        # in this excerpt; the control flow below is incomplete as shown.
        self.report_video_subtitles_download(video_id)
        # First ask for the list of available caption tracks.
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
        srt_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
        # Map language code -> track name for every available caption track.
        srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
        srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
        if not srt_lang_list:
            return (u'WARNING: video has no closed captions', None)
        # Language choice: user-requested first, then English, then first track.
        if self._downloader.params.get('subtitleslang', False):
            srt_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in srt_lang_list:
            srt_lang = list(srt_lang_list.keys())[0]
        if not srt_lang in srt_lang_list:
            return (u'WARNING: no closed captions found in the specified language', None)
        # Build the timedtext request for the chosen track.
        params = compat_urllib_parse.urlencode({
            'name': srt_lang_list[srt_lang].encode('utf-8'),
        url = 'http://www.youtube.com/api/timedtext?' + params
        srt_xml = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
        return (u'WARNING: Did not fetch video subtitles', None)
        return (None, self._closed_captions_xml_to_srt(srt_xml))
    def _print_formats(self, formats):
        """Pretty-print the available itags with their extension and dimensions."""
        print('Available formats:')
        # NOTE(review): the loop header over *formats* binding `x` is elided
        # in this excerpt -- confirm against the full file.
        print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
    def _real_initialize(self):
        """Set the site language, log in (credentials or .netrc) and confirm age."""
        # NOTE(review): many lines (try: openers, else branches, returns,
        # literal openers) are elided in this excerpt; the control flow below
        # is incomplete as shown.
        if self._downloader is None:
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            info = netrc.netrc().authenticators(self._NETRC_MACHINE)
            raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))

        # Force the English UI so the regexes used elsewhere match.
        request = compat_urllib_request.Request(self._LANG_URL)
        compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % compat_str(err))

        # No authentication to be performed
        # Fetch the login page to harvest its hidden form tokens.
        request = compat_urllib_request.Request(self._LOGIN_URL)
        login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to fetch login page: %s' % compat_str(err))

        # Anti-CSRF tokens embedded in the login form.
        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
        galx = match.group(1)
        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)

        # Hidden fields Google's form expects alongside the credentials.
        u'continue': u'http://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
        u'PersistentCookie': u'yes',
        u'bgresponse': u'js_disabled',
        u'checkConnection': u'',
        u'checkedDomains': u'youtube',
        u'signIn': u'Sign in',
        u'service': u'youtube',

        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # does not handle unicode values reliably.
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        # If the login form is still present, the credentials were rejected.
        if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
            self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))

        # Confirm the age gate.
        'action_confirm': 'Confirm',
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        self.report_age_confirmation()
        age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
    def _extract_id(self, url):
        """Pull the bare video ID out of any accepted YouTube URL form."""
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        # NOTE(review): the `if mobj is None:` guard and the final
        # `return video_id` are elided in this excerpt.
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group(2)
    def _real_extract(self, url):
        """Download and parse a watch page; builds one info dict per selected format."""
        # NOTE(review): numerous lines (try: openers, guards, returns, literal
        # openers/closers) are elided in this excerpt; the control flow below
        # is incomplete as shown.
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Download the watch page.
        self.report_video_webpage_download(video_id)
        url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
        video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        # Un-escape the JS-escaped URL (\\/ -> /).
        player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))

        # Try several `el` variants of get_video_info until one yields a token.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = compat_urllib_request.Request(video_info_url)
            video_info_webpage_bytes = compat_urllib_request.urlopen(request).read()
            video_info_webpage = video_info_webpage_bytes.decode('utf-8', 'ignore')
            video_info = compat_parse_qs(video_info_webpage)
            if 'token' in video_info:
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0])
            self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.trouble(u'ERROR: "rental" videos not supported')

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader name')
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id (nickname from the channel/user link on the page)
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        video_uploader_id = mobj.group(1)
        self._downloader.trouble(u'WARNING: unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
        else: # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date: normalize separators, then try several date formats.
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
        format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
        for expression in format_expressions:
            upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')

        # description
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
        video_description = ''

        # closed captions (optional)
        video_subtitles = None
        if self._downloader.params.get('writesubtitles', False):
            (srt_error, video_subtitles) = self._extract_subtitles(video_id)
            self._downloader.trouble(srt_error)

        if 'length_seconds' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video duration')
        video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            # Parse the comma-separated, urlencoded per-format stream map.
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
            # NOTE(review): assumes 'sig' is always present; a stream entry
            # without it would raise KeyError here.
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
            format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
                # Specific formats. We pick the first in a slash-delimeted sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    video_url_list = [(rf, url_map[rf])]
                if video_url_list is None:
                    self._downloader.trouble(u'ERROR: requested format not available')
            self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')

        # One result dict per selected format.
        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

                'url':      video_real_url,
                'uploader': video_uploader,
                'uploader_id': video_uploader_id,
                'upload_date':  upload_date,
                'title':    video_title,
                'ext':      video_extension,
                'format':   video_format,
                'thumbnail':    video_thumbnail,
                'description':  video_description,
                'player_url':   player_url,
                'subtitles':    video_subtitles,
                'duration':     video_duration
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    # Family-filter disclaimer page fetched once during initialization.
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    # Endpoint POSTed to in order to switch the family filter off.
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'
597 def __init__(self, downloader=None):
598 InfoExtractor.__init__(self, downloader)
600 def report_disclaimer(self):
601 """Report disclaimer retrieval."""
602 self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')
604 def report_age_confirmation(self):
605 """Report attempt to confirm age."""
606 self._downloader.to_screen(u'[metacafe] Confirming age')
608 def report_download_webpage(self, video_id):
609 """Report webpage download."""
610 self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)
612 def report_extraction(self, video_id):
613 """Report information extraction."""
614 self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)
    def _real_initialize(self):
        """Fetch the disclaimer page, then POST the family-filter opt-out."""
        # NOTE(review): try: openers, the disclaimer_form opener and returns
        # are elided in this excerpt; control flow is incomplete as shown.
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        self.report_disclaimer()
        disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err))

        # Confirm we are over 18 so filtered videos become visible.
        'submit': "Continue - I'm over 18",
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
        self.report_age_confirmation()
        disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
    def _real_extract(self, url):
        """Extract media URL, title and uploader from a metacafe watch page."""
        # NOTE(review): guard (`if mobj is None:`), try: and return lines are
        # elided in this excerpt; control flow is incomplete as shown.
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            # Hand the wrapped YouTube ID over to the downloader's queue.
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
        self.report_download_webpage(video_id)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))
        video_extension = mediaURL[-3:]

        # Extract gdaKey if available
        mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
        gdaKey = mobj.group(1)
        video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)

        # Fallback: dig the media URL out of the flashvars blob.
        mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
        self._downloader.trouble(u'ERROR: unable to extract media URL')
        vardict = compat_parse_qs(mobj.group(1))
        if 'mediaData' not in vardict:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
        mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
        self._downloader.trouble(u'ERROR: unable to extract media URL')
        mediaURL = mobj.group(1).replace('\\/', '/')
        video_extension = mediaURL[-3:]
        video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'submitter=(.*?);', webpage)
        self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = mobj.group(1)

        # Single-entry result dict.
        'id':       video_id.decode('utf-8'),
        'url':      video_url.decode('utf-8'),
        'uploader': video_uploader.decode('utf-8'),
        'title':    video_title,
        'ext':      video_extension.decode('utf-8'),
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    # Case-insensitive; group(1) carries the slug whose leading token is the video ID.
    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'
723 def __init__(self, downloader=None):
724 InfoExtractor.__init__(self, downloader)
726 def report_extraction(self, video_id):
727 """Report information extraction."""
728 self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
    def _real_extract(self, url):
        """Extract the best available MP4 URL plus metadata from a Dailymotion page."""
        # NOTE(review): guard, try:, else-branch and return lines are elided
        # in this excerpt; control flow is incomplete as shown.
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        video_id = mobj.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        # Disable the family filter so age-restricted videos resolve too.
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'\s*var flashvars = (.*)', webpage)
        self._downloader.trouble(u'ERROR: unable to extract media URL')
        flashvars = compat_urllib_parse.unquote(mobj.group(1))

        # Probe qualities from best to worst; stop at the first key present.
        for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
            self._downloader.to_screen(u'[dailymotion] Using %s' % key)
            self._downloader.trouble(u'ERROR: unable to extract video URL')

        mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        self._downloader.trouble(u'ERROR: unable to extract video URL')

        video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = unescapeHTML(mobj.group('title'))

        video_uploader = None
        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        # looking for an official user
        mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
        if mobj_official is None:
            self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
        video_uploader = mobj_official.group(1)
        video_uploader = mobj.group(1)

        video_upload_date = None
        mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        # Page shows DD-MM-YYYY; reassemble as YYYYMMDD.
        video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)

        'uploader': video_uploader,
        'upload_date':  video_upload_date,
        'title':    video_title,
        'ext':      video_extension,
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    # Only .flv media reached via the `current=` query parameter are handled.
    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'
811 def __init__(self, downloader=None):
812 InfoExtractor.__init__(self, downloader)
814 def report_download_webpage(self, video_id):
815 """Report webpage download."""
816 self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)
818 def report_extraction(self, video_id):
819 """Report information extraction."""
820 self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)
    def _real_extract(self, url):
        """Extract the FLV URL, title and uploader from a photobucket page."""
        # NOTE(review): guard, try: and return lines are elided in this
        # excerpt; control flow is incomplete as shown.
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        self.report_download_webpage(video_id)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract media URL')
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))

        # Title and uploader come from the same <title> pattern.
        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        video_uploader = mobj.group(2).decode('utf-8')

        # Single-entry result dict.
        'id':       video_id.decode('utf-8'),
        'url':      video_url.decode('utf-8'),
        'uploader': video_uploader,
        'title':    video_title,
        'ext':      video_extension.decode('utf-8'),
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'
880 def __init__(self, downloader=None):
881 InfoExtractor.__init__(self, downloader)
883 def report_download_webpage(self, video_id):
884 """Report webpage download."""
885 self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)
887 def report_extraction(self, video_id):
888 """Report information extraction."""
889 self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)
    def _real_extract(self, url, new_video=True):
        """Extract media URL and metadata; non-/watch/ URLs are rewritten and re-entered once."""
        # NOTE(review): guard, try: and return lines are elided in this
        # excerpt; control flow is incomplete as shown.
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = compat_urllib_request.Request(url)
            webpage = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
            self._downloader.trouble(u'ERROR: Unable to extract id field')
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
            self._downloader.trouble(u'ERROR: Unable to extract vid field')
            yahoo_vid = mobj.group(1)

            # Re-enter with the canonical /watch/ URL; new_video=False
            # prevents a second rewrite pass.
            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        self.report_download_webpage(video_id)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video uploader')
        video_uploader = mobj.group(1).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video description')
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video height')
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video width')
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
        yv_bitrate = '700' # according to Wikipedia this is hard-coded
        request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        self.report_download_webpage(video_id)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
        self._downloader.trouble(u'ERROR: Unable to extract media URL')
        video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        video_url = unescapeHTML(video_url)

        # Single-entry result dict.
        'id':       video_id.decode('utf-8'),
        'uploader': video_uploader,
        'upload_date':  None,
        'title':    video_title,
        'ext':      video_extension.decode('utf-8'),
        'thumbnail':    video_thumbnail.decode('utf-8'),
        'description':  video_description,
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract the video info dict for a Vimeo URL.

        Returns a single-element list of info dictionaries, or None after
        reporting trouble to the downloader.
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        if not mobj.group('proto'):
            url = 'https://' + url
        if mobj.group('direct_link'):
            # HLS redirect URLs are normalized back to the canonical video page
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        try:
            self.report_download_webpage(video_id)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON embedded in the page
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except (IndexError, ValueError):
            self._downloader.trouble(u'ERROR: unable to extract info section')
            return

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            video_description = ''

        # Extract upload date (YYYYMMDD) from the page metadata, if present
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = {'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            # No codec matched at any quality level
            self._downloader.trouble(u'ERROR: no known codec found')
            return

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
            %(video_id, sig, timestamp, video_quality, video_codec.upper())

        return [{
            'id':           video_id,
            'url':          video_url,
            'uploader':     video_uploader,
            'uploader_id':  video_uploader_id,
            'upload_date':  video_upload_date,
            'title':        video_title,
            'ext':          video_extension,
            'thumbnail':    video_thumbnail,
            'description':  video_description,
        }]
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    # Live-stream pages end in index-<digits>.html
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)

    def fetch_webpage(self, url):
        """Download *url* and return its raw body, reporting trouble on failure."""
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch *url*, apply *regex*, and map the listed groups into a dict.

        matchTuples is a list of (group_index, key, error_message); the error
        message is reported if the group did not match.
        """
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        info = {}

        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                self._downloader.trouble(err)
                return
            else:
                info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        """Follow the live-stream JS indirection to locate the stream URL."""
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
                '(rtmp://.*?)\'',
            re.DOTALL,
            [
                (1, 'path',   u'ERROR: could not extract video path: %s' % url),
                (2, 'player', u'ERROR: could not extract video player: %s' % url),
                (3, 'url',    u'ERROR: could not extract video url: %s' % url)
            ]
        )
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Resolve an Arte+7 page through its XML indirections to an info dict."""
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'ERROR: Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id',    u'ERROR: could not extract video id: %s' % url),
                (2, 'title', u'ERROR: could not extract video title: %s' % url),
                (3, 'date',  u'ERROR: could not extract video date: %s' % url),
                (4, 'url',   u'ERROR: could not extract video url: %s' % url)
            ]
        )

        return {
            'id':           info.get('id'),
            'url':          compat_urllib_parse.unquote(info.get('url')),
            'uploader':     u'arte.tv',
            'upload_date':  info.get('date'),
            'title':        info.get('title').decode('utf-8'),
            'ext':          u'mp4',
        }

    def _real_extract(self, url):
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
            return
        else:
            info = self.extractPlus7Stream(url)

        return [info]
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor."""

    IE_NAME = u'generic'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download (with a fallback warning)."""
        self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
        self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)

    def report_following_redirect(self, new_url):
        """Report that a redirect is being followed."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case restart chain."""
        class HeadRequest(compat_urllib_request.Request):
            # Force HEAD so we don't download the body just to find redirects
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                fp.read()
                fp.close()

                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                                      headers=newheaders,
                                                                      origin_req_host=req.get_origin_req_host(),
                                                                      unverifiable=True))

        # Build our own opener with the HEAD-aware handlers
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        if url == new_url:
            # Not a redirect; let the normal extraction continue
            return False

        self.report_following_redirect(new_url)
        self._downloader.download([new_url])
        return True

    def _real_extract(self, url):
        if self._test_redirect(url): return

        video_id = url.split('/')[-1]
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
            return
        except ValueError as err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit
            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit: JWPlayer JS loader
            mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
            return

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_title = mobj.group(1)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        video_uploader = mobj.group(1)

        return [{
            'id':           video_id,
            'url':          video_url,
            'uploader':     video_uploader,
            'upload_date':  None,
            'title':        video_title,
            'ext':          video_extension,
        }]
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries."""

    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the ytsearch prefix and dispatch to _download_n_results."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]  # strip the literal 'ytsearch'
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_youtube_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_youtube_results:
                    self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                    n = self._max_youtube_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        limit = n

        # The API serves 50 results per page; keep paging until we have
        # enough ids or the API reports fewer total items.
        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err))
                return
            api_response = json.loads(data)['data']

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
        return
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries."""

    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the gvsearch prefix and dispatch to _download_n_results."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]  # strip the literal 'gvsearch'
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_google_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_google_results:
                    self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
                    n = self._max_google_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        pagenum = 0

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in video_ids:
                    video_ids.append(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
                        return

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                # No further result pages; download what was collected
                for id in video_ids:
                    self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
                return

            pagenum = pagenum + 1
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""

    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the yvsearch prefix and dispatch to _download_n_results."""
        mobj = re.match(self._VALID_URL, query)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
            return

        prefix, query = query.split(':')
        prefix = prefix[8:]  # strip the literal 'yvsearch'
        query = query.encode('utf-8')
        if prefix == '':
            self._download_n_results(query, 1)
            return
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
            return
        else:
            try:
                n = int(prefix)
                if n <= 0:
                    self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
                    return
                elif n > self._max_yahoo_results:
                    self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
                    n = self._max_yahoo_results
                self._download_n_results(query, n)
                return
            except ValueError: # parsing prefix as integer fails
                self._download_n_results(query, 1)
                return

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        video_ids = []
        # Yahoo may repeat results across pages, so track what we've seen
        already_seen = set()
        pagenum = 1

        while True:
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
            request = compat_urllib_request.Request(result_url)
            try:
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in already_seen:
                    video_ids.append(video_id)
                    already_seen.add(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                        return

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                # No further result pages; download what was collected
                for id in video_ids:
                    self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                return

            pagenum = pagenum + 1
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    _VALID_URL = r'(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL|EC)?|PL|EC)([0-9A-Za-z-_]{10,})(?:/.*?/([0-9A-Za-z_-]+))?.*'
    _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
    _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&([^&"]+&)*list=.*?%s'
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:playlist'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        # Single video case: hand the direct video off to the downloader
        if mobj.group(3) is not None:
            self._downloader.download([mobj.group(3)])
            return

        # Download playlist pages
        # prefix is 'p' as default for playlists but there are other types that need extra care
        playlist_prefix = mobj.group(1)
        if playlist_prefix == 'a':
            playlist_access = 'artist'
        else:
            playlist_prefix = 'p'
            playlist_access = 'view_play_list'
        playlist_id = mobj.group(2)
        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(playlist_id, pagenum)
            url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
            request = compat_urllib_request.Request(url)
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []
            for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))
            video_ids.extend(ids_in_page)

            if self._MORE_PAGES_INDICATOR not in page:
                break
            pagenum = pagenum + 1

        total = len(video_ids)

        # Apply the --playlist-start/--playlist-end slice (1-based, -1 = to end)
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)
        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        if len(video_ids) == total:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total))
        else:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(video_ids)))

        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
        return
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:channel'

    def report_download_page(self, channel_id, pagenum):
        """Report attempt to download channel page with given number."""
        self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))

    def _real_extract(self, url):
        # Extract channel id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        # Download channel pages
        channel_id = mobj.group(1)
        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(channel_id, pagenum)
            url = self._TEMPLATE_URL % (channel_id, pagenum)
            request = compat_urllib_request.Request(url)
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []
            for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))
            video_ids.extend(ids_in_page)

            if self._MORE_PAGES_INDICATOR not in page:
                break
            pagenum = pagenum + 1

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
        return
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 0

        while True:
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            self.report_download_page(username, start_index)

            request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._GDATA_PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)
        # Apply the --playlist-start/--playlist-end slice (1-based, -1 = to end)
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
                (username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    # Ajax episode-list endpoint returns at most this many videos per page
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
                (self.IE_NAME, username, pagenum))

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
            return

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        request = compat_urllib_request.Request(url)

        try:
            page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            mobj = re.search(r'data-users-id="([^"]+)"', page)
            page_base = page_base % mobj.group(1)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            self.report_download_page(username, pagenum)
            url = page_base + "&page=" + str(pagenum)
            request = compat_urllib_request.Request( url )
            try:
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
                return

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(unescapeHTML(mobj.group(1)))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        all_ids_count = len(video_ids)
        # Apply the --playlist-start/--playlist-end slice (1-based, -1 = to end)
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        else:
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
                (self.IE_NAME, username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download([u'http://blip.tv/'+video_id])
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com.

    Simulates pressing the 'Free download' button and scrapes the resulting
    page for the real file URL and title.
    """

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)

    def _real_extract(self, url):
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = { 'gateway_result' : '1' }
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        try:
            self.report_download_webpage(file_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % compat_str(err))
            return

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
                self._downloader.trouble(u'ERROR: %s' % restriction_message)
            else:
                self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
            return

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        mobj = re.search(r'<b title="(.*?)">', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        file_title = mobj.group(1).decode('utf-8')

        return [{
            'id': file_id.decode('utf-8'),
            'url': file_url.decode('utf-8'),
            'uploader': None,
            'upload_date': None,
            'title': file_title,
            'ext': file_extension.decode('utf-8'),
        }]
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook.

    Optionally logs in (credentials from options or .netrc) in
    _real_initialize, then scrapes the SWF parameter blob from the video
    page to find the direct video URL.
    """

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[%s] Logging in' % self.IE_NAME)

    def _real_initialize(self):
        """Log in to Facebook if credentials are available; warnings only,
        never fatal -- extraction proceeds anonymously on failure."""
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
                return

        if useremail is None:
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
            }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read()
            # A login form in the response means the credentials were rejected.
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('ID')

        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The JSON parameter array is sandwiched between two known JS snippets.
        BEFORE = '[["allowFullScreen","true"],["allowScriptAccess","always"],["salign","tl"],["scale","noscale"],["wmode","opaque"]].forEach(function(param) {swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        if not m:
            raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        # Prefer the HD source, fall back to SD.
        video_url = params['hd_src']
        if not video_url:
            video_url = params['sd_src']
        if not video_url:
            raise ExtractorError(u'Cannot find video URL')
        video_duration = int(params['video_duration'])

        m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
        if not m:
            raise ExtractorError(u'Cannot find title in webpage')
        video_title = unescapeHTML(m.group(1))

        info = {
            'id': video_id,
            'title': video_title,
            'url': video_url,
            'ext': 'mp4',
            'duration': video_duration,
            'thumbnail': params['thumbnail_src'],
        }
        return [info]
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv.

    Queries the JSON API for video metadata; if the server instead answers
    with the media itself (Content-Type video/*), falls back to a direct
    download via the open url handle.
    """

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_direct_download(self, title):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Append the JSON-skin query, reusing '?' or '&' as appropriate.
        if '?' in url:
            cchar = '&'
        else:
            cchar = '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
                basename = url.split('/')[-1]
                title,ext = os.path.splitext(basename)
                title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    'urlhandle': urlh
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None: # Regular URL
            try:
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % compat_str(err))
                return

            try:
                json_data = json.loads(json_code)
                # The payload is sometimes wrapped in a 'Post' object.
                if 'Post' in json_data:
                    data = json_data['Post']
                else:
                    data = json_data

                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl'],
                    'user_agent': 'iTunes/10.6.1',
                }
            except (ValueError,KeyError) as err:
                self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
                return

        return [info]
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)

    def _real_extract(self,url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # BUG FIX: was `self._download.trouble(...)` -- the attribute is
            # `_downloader` everywhere else; `_download` would raise
            # AttributeError instead of reporting the bad URL.
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)
        # The thumbnail link encodes the media base URL; the FLV lives next
        # to it under /<video_id>.flv.
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\' />',
                 webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': u'flv',
        }]
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                      |(https?://)?(www\.)?
                          (?P<showname>thedailyshow|colbertnation)\.com/
                         (full-episodes/(?P<episode>.*)|
                          (?P<clip>
                              (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                              |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                     $"""

    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    # NOTE(review): dict contents were lost in the paste; restored from the
    # known bitrate list -- confirm against upstream.
    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL is a verbose-mode regexp.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def report_extraction(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)

    def report_config_download(self, episode_id, media_id):
        self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration for %s' % (episode_id, media_id))

    def report_index_download(self, episode_id):
        self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)

    def _print_formats(self, formats):
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # Shortnames (:tds, :colbert, ...) redirect to the newest full episode.
        if mobj.group('shortname'):
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
            dlNewest = False
        else:
            dlNewest = not mobj.group('episode')
            if dlNewest:
                epTitle = mobj.group('showname')
            else:
                epTitle = mobj.group('episode')

        req = compat_urllib_request.Request(url)
        self.report_extraction(epTitle)
        try:
            htmlHandle = compat_urllib_request.urlopen(req)
            html = htmlHandle.read()
            webpage = html.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            return
        if dlNewest:
            # The episode-list page redirects to the newest episode; re-match
            # the final URL to get its title.
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
                self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
                return
            if mobj.group('episode') == '':
                self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
                return
            epTitle = mobj.group('episode')

        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
                return
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        self.report_index_download(epTitle)
        try:
            indexXml = compat_urllib_request.urlopen(indexUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download episode index: ' + compat_str(err))
            return

        results = []

        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        # Each <item> is one part of the episode; download the media config
        # for each part and pick a rendition.
        for partNum,itemEl in enumerate(itemEls):
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = itemEl.findall('./pubDate')[0].text

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                        compat_urllib_parse.urlencode({'uri': mediaId}))
            configReq = compat_urllib_request.Request(configUrl)
            self.report_config_download(epTitle, shortMediaId)
            try:
                configXml = compat_urllib_request.urlopen(configReq).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
                return

            cdoc = xml.etree.ElementTree.fromstring(configXml)
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the highest bitrate
            format,rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f,v in turls:
                if f == req_format:
                    format, rtmp_video_url = f, v
                    break

            # Rewrite the RTMP url to the equivalent progressive-HTTP one.
            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            if not m:
                raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
            }
            results.append(info)

        return results
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist.

    Scrapes OpenGraph meta tags for description/thumbnail/player, then
    fetches the player's config (JS masquerading as JSON) for the media URL.
    """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def report_extraction(self, showName):
        self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)

    def report_config_download(self, showName):
        self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(showName)
        try:
            webPage = compat_urllib_request.urlopen(url)
            webPageBytes = webPage.read()
            # Decode using the charset the server declared, defaulting to UTF-8.
            m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
            webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: ' + compat_str(err))
            return

        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
        description = unescapeHTML(descMatch.group(1))
        imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
        imgUrl = unescapeHTML(imgMatch.group(1))
        playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
        playerUrl = unescapeHTML(playerUrlMatch.group(1))
        configUrlMatch = re.search('config=(.*)$', playerUrl)
        configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))

        self.report_config_download(showName)
        try:
            configJSON = compat_urllib_request.urlopen(configUrl)
            m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
            configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download configuration: ' + compat_str(err))
            return

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

        try:
            config = json.loads(configJSON)
        except (ValueError,) as err:
            self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + compat_str(err))
            return

        playlist = config['playlist']
        videoUrl = playlist[1]['url']

        info = {
            'id': videoId,
            'url': videoUrl,
            'uploader': showName,
            'upload_date': None,
            'title': showName,
            'ext': 'flv',
            'thumbnail': imgUrl,
            'description': description,
            'player_url': playerUrl,
        }

        return [info]
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com.

    Reads the moogaloop metadata XML for title/description/thumbnail, then
    the Adobe f4m manifest to build the final fragment URL.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group('videoid')

        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            return

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        try:
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
        except IndexError:
            self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
            return

        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            return

        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        try:
            # f4m elements live in the Adobe namespace.
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError as err:
            self._downloader.trouble(u'\nERROR: Invalid manifest file')
            return

        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'

        info['url'] = url
        info['ext'] = 'f4f'
        return [info]
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com."""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group(1)

        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        # Extract video URL (percent-encoded inside the flv_url parameter)
        mobj = re.search(r'flv_url=(.+?)&', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(mobj.group(1))

        # Extract title
        mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = mobj.group(1)

        # Extract video thumbnail
        mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = mobj.group(0)

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }

        return [info]
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com
       To access the media, the uid of the song and a stream token
       must be extracted from the page source and the script must make
       a request to media.soundcloud.com/crossdomain.xml. Then
       the media can be grabbed by requesting from an url composed
       of the stream token and uid
     """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_resolve(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract simple title (uploader + slug of song title)
        slug_title = mobj.group(2)
        simple_title = uploader + u'-' + slug_title

        self.report_resolve('%s/%s' % (uploader, slug_title))

        # Resolve the page URL to the track's API metadata.
        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(resolv_url)
        try:
            info_json_bytes = compat_urllib_request.urlopen(request).read()
            info_json = info_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
            return

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction('%s/%s' % (uploader, slug_title))

        # Fetch the per-track stream definitions and pick the 128k MP3 stream.
        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        request = compat_urllib_request.Request(streams_url)
        try:
            stream_json_bytes = compat_urllib_request.urlopen(request).read()
            stream_json = stream_json_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download stream definitions: %s' % compat_str(err))
            return

        streams = json.loads(stream_json)
        mediaURL = streams['http_mp3_128_url']

        return [{
            'id': info['id'],
            'url': mediaURL,
            'uploader': info['user']['username'],
            'upload_date': info['created_at'],
            'title': info['title'],
            'ext': u'mp3',
            'description': info['description'],
        }]
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com.

    The RTMP path is base64-encoded in the page ('jsclassref'); the title
    and description come from the page markup.
    """
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # Extract video URL
        mobj = re.search(r"jsclassref='([^']*)'", webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        # Extract title
        mobj = re.search(r'contentTitle = "(.*?)";', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = mobj.group(1)

        # Extract description
        video_description = u'No description available.'
        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
        if mobj is not None:
            video_description = mobj.group(1)

        video_filename = video_url.split('/')[-1]
        video_id, extension = video_filename.split('.')

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }

        return [info]
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_json(self, file_id):
        """Report JSON download."""
        self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json"""
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        for url in url_list:
            try:
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                url = None

        return None

    def _print_formats(self, formats):
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        # extract uploader & filename from url
        uploader = mobj.group(1).decode('utf-8')
        file_id = uploader + "-" + mobj.group(2).decode('utf-8')

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err))
            return

        # parse JSON
        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        if req_format is None or req_format == 'best':
            # Walk the formats until one of them has a live URL.
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats:
                self._downloader.trouble(u'ERROR: format is not available')
                return

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        return [{
            'id': file_id.decode('utf-8'),
            'url': file_url.decode('utf-8'),
            'uploader': uploader.decode('utf-8'),
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1].decode('utf-8'),
            'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url.decode('utf-8'),
        }]
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom.

    _VALID_URL matches three URL shapes: a specific video
    (course + video query params), a course page (course param only),
    and the site root. Course and root pages return 'playlist'-style
    info dicts whose 'list' entries are recursively fed back through
    self.extract().
    """
    # NOTE(review): this excerpt appears elided — "if mobj is None:" guards,
    # "info = {" openings, "try:" lines and "return" statements are missing
    # from view. Comments below annotate only what is visible.
    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def report_download_webpage(self, objid):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): guard line missing from view before this raise
        raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            # NOTE(review): enclosing "info = {" missing from view
                'id': course + '_' + video,
                'upload_date': None,
            self.report_extraction(info['id'])
            # Per-video metadata lives in an XML file next to the course videos.
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            info['title'] = mdoc.findall('./title')[0].text
            info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
            # Extension is taken from the URL's final dot-suffix.
            info['ext'] = info['url'].rpartition('.')[2]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
                'upload_date': None,

            coursepage = self._download_webpage(url, info['id'],
                                                note='Downloading course info page',
                                                errnote='Unable to download course info page')

            m = re.search('<h1>([^<]+)</h1>', coursepage)
            info['title'] = unescapeHTML(m.group(1))
            # Fall back to the id when no <h1> title is found.
            info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
            info['description'] = unescapeHTML(m.group(1))

            # Every VideoPage link on the course page becomes a reference entry.
            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
                'type': 'reference',
                'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),

            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
        else: # Root page
            # NOTE(review): "else:" reconstructed position — enclosing "info = {"
            # missing from view here as well
                'id': 'Stanford OpenClassroom',
                'upload_date': None,

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            rootpage = compat_urllib_request.urlopen(rootURL).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))

            info['title'] = info['id']

            # Every CoursePage link on the root page becomes a reference entry.
            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
                'type': 'reference',
                'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),

            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com.

    Scrapes song/performer metadata from <meta> tags on the video page,
    then downloads a mediaGen XML to pick a rendition URL.
    """
    # NOTE(review): this excerpt appears elided — "if mobj is None:" guards,
    # "try:" lines and the final "info = {" / "return" are missing from view.
    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        # _VALID_URL makes the protocol optional, so add one if it is absent.
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        # Song name, performer and playlist URI come from <meta> tags.
        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract song name')
        song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract performer')
        performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        self._downloader.trouble(u'ERROR: unable to mtvn_uri')
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        self._downloader.trouble(u'ERROR: unable to extract content id')
        content_id = mobj.group(1)

        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        # Format string: "<ext>-<width>x<height>_<bitrate>".
        _,_,ext = rendition.attrib['type'].partition('/')
        format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
        video_url = rendition.find('./src').text
        self._downloader.trouble('Invalid rendition field.')

            'uploader': performer,
            'upload_date': None,
            'title': video_title,
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com.

    Fetches the getPlayList JSON, then descrambles per-segment file IDs
    (via a seeded character mix) to build getFlvPath download URLs —
    one info dict per video segment.
    """
    # NOTE(review): this excerpt appears elided — e.g. the "def _gen_sid"
    # line, "mixed = []" initialisation, format-selection branches and the
    # final "return files_info" are missing from view.
    _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, file_id))

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

        # Session id: current millis + two random components.
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        """Build the seeded character permutation used to descramble file IDs."""
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        # Linear-congruential shuffle: each step picks and removes one char.
        for i in range(len(source)):
            seed = (seed * 211 + 30031 ) % 65536
            index = math.floor(seed / 65536 * len(source) )
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        #return ''.join(mixed)

    def _get_file_id(self, fileId, seed):
        """Map the '*'-separated scrambled id through the seeded mix string."""
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
            realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        request = compat_urllib_request.Request(info_url, None, std_headers)
        self.report_download_webpage(video_id)
        jsondata = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

        self.report_extraction(video_id)

        jsonstr = jsondata.decode('utf-8')
        config = json.loads(jsonstr)

        video_title = config['data'][0]['title']
        seed = config['data'][0]['seed']

        format = self._downloader.params.get('format', None)
        supported_format = list(config['data'][0]['streamfileids'].keys())

        # NOTE(review): hd2/worst/explicit-format selection branches elided here.
        if format is None or format == 'best':
            if 'hd2' in supported_format:
        elif format == 'worst':

            fileid = config['data'][0]['streamfileids'][format]
            keys = [s['k'] for s in config['data'][0]['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            self._downloader.trouble(u'ERROR: unable to extract info section')

        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        #column 8,9 of fileid represent the segment number
        #fileid[7:9] should be changed
        for index, key in enumerate(keys):
            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'upload_date': None,
                'title': video_title,

            files_info.append(info)
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com.

    Pulls the flv URL, title and thumbnail out of the watch page with
    the three class-level regexes below.
    """
    # NOTE(review): this excerpt appears elided — "if mobj is None:" /
    # "if result is None:" guards, "try:" lines and the final "return"
    # are missing from view.
    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'

    VIDEO_URL_RE = r'flv_url=(.*?)&'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'

    def report_webpage(self, video_id):
        """Report information extraction"""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction"""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group(1)

        self.report_webpage(video_id)

        # Get webpage content
        webpage_bytes = compat_urllib_request.urlopen(url).read()
        webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)

        result = re.search(self.VIDEO_URL_RE, webpage)
        self._downloader.trouble(u'ERROR: unable to extract video url')
        # The flv_url parameter is percent-encoded in the page.
        video_url = compat_urllib_parse.unquote(result.group(1))

        result = re.search(self.VIDEO_TITLE_RE, webpage)
        self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = result.group(1)

        result = re.search(self.VIDEO_THUMB_RE, webpage)
        self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
        video_thumbnail = result.group(1)

            'upload_date': None,
            'title': video_title,
            'thumbnail': video_thumbnail,
            'description': None,
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com.

    Two-step scrape: the post page yields date/uploader/title, then the
    linked photo/video page is fetched to collect redirector video links,
    from which the highest resolution is chosen.
    """
    # NOTE(review): this excerpt appears elided — "if mobj is None:" guards,
    # "try:" lines and the final "return" are missing from view.
    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extract_entry(self, url):
        """Report downloading extry"""
        self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)

    def report_date(self, upload_date):
        """Report downloading extry"""
        self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)

    def report_uploader(self, uploader):
        """Report downloading extry"""
        self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)

    def report_title(self, video_title):
        """Report downloading extry"""
        self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)

    def report_extract_vid_page(self, video_page):
        """Report information extraction."""
        self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)

    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        post_url = mobj.group(0)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        self.report_extract_entry(post_url)
        request = compat_urllib_request.Request(post_url)
        webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))

        # Extract update date
        pattern = 'title="Timestamp">(.*?)</a>'
        mobj = re.search(pattern, webpage)
        upload_date = mobj.group(1)
        # Convert timestring to a format suitable for filename
        upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
        upload_date = upload_date.strftime('%Y%m%d')
        self.report_date(upload_date)

        pattern = r'rel\="author".*?>(.*?)</a>'
        mobj = re.search(pattern, webpage)
        uploader = mobj.group(1)
        self.report_uploader(uploader)

        # Get the first line for title
        pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
        mobj = re.search(pattern, webpage)
        video_title = mobj.group(1)
        self.report_title(video_title)

        # Step 2, Stimulate clicking the image box to launch video
        pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
        mobj = re.search(pattern, webpage)
        self._downloader.trouble(u'ERROR: unable to extract video page URL')

        video_page = mobj.group(1)
        request = compat_urllib_request.Request(video_page)
        webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
        self.report_extract_vid_page(video_page)

        # Extract video links on video page
        """Extract video links of all sizes"""
        pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        mobj = re.findall(pattern, webpage)
        self._downloader.trouble(u'ERROR: unable to extract video links')

        # Sort in resolution
        links = sorted(mobj)

        # Choose the lowest of the sort, i.e. highest resolution
        video_url = links[-1]
        # Only get the url. The resolution part in the tuple has no use anymore
        video_url = video_url[-1]
        # Treat escaped \u0026 style hex
        video_url = video_url.decode("unicode_escape")
        except AttributeError: # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')

            'uploader': uploader,
            'upload_date': upload_date,
            'title': video_title,
            'ext': video_extension,
class NBAIE(InfoExtractor):
    """Information extractor for nba.com videos.

    The download URL is constructed directly from the path component of
    the page URL; metadata is scraped from og:/description tags.
    """
    # NOTE(review): this excerpt appears elided — guards and part of the
    # returned info dict are missing from view.
    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        video_id = mobj.group(1)
        if video_id.endswith('/index.html'):
            video_id = video_id[:-len('/index.html')]

        webpage = self._download_webpage(url, video_id)

        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
        def _findProp(rexp, default=None):
            # Small helper: first regex group from the page, unescaped,
            # or the provided default when the pattern does not match.
            m = re.search(rexp, webpage)
            return unescapeHTML(m.group(1))

        shortened_video_id = video_id.rpartition('/')[2]
        title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
            'id': shortened_video_id,
            'uploader_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
            'description': _findProp(r'<div class="description">(.*?)</h1>'),
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?
    # NOTE(review): this excerpt appears elided — "try:" lines, the "info"
    # dict opening inside _parse_page, pagination setup and the final
    # "return" are missing from view.

    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
                (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url):
        urlh = compat_urllib_request.urlopen(url)
        webpage_bytes = urlh.read()
        webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info JSON: %s' % compat_str(err))

        response = json.loads(webpage)
        # The API returns a list on success; anything else is an error object.
        if type(response) != list:
            error_text = response.get('error', 'unknown error')
            self._downloader.trouble(u'ERROR: Justin.tv API: %s' % error_text)
        for clip in response:
            video_url = clip['video_file_url']
            video_extension = os.path.splitext(video_url)[1][1:]
            # start_time is ISO-ish; strip dashes from the date part (YYYYMMDD).
            video_date = re.sub('-', '', clip['start_time'][:10])
            video_uploader_id = clip.get('user_id', clip.get('channel_id'))
            video_id = clip['id']
            video_title = clip.get('title', video_id)
                'title': video_title,
                'uploader': clip.get('channel_name', video_uploader_id),
                'uploader_id': video_uploader_id,
                'upload_date': video_date,
                'ext': video_extension,
        return (len(response), info)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        api = 'http://api.justin.tv'
        video_id = mobj.group(mobj.lastindex)
        # lastindex == 1 means only the channel group matched (no /b/<id>).
        if mobj.lastindex == 1:
            api += '/channel/archives/%s.json'
            api += '/broadcast/by_archive/%s.json'
        api = api % (video_id,)

        self.report_extraction(video_id)

        limit = self._JUSTIN_PAGE_LIMIT
            self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url)
            info.extend(page_info)
            # A short page means we have reached the end of the archive.
            if not paged or page_count != limit:
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com.

    Scrapes the <video>/<source> URL, player-page title and
    og:description from the watch page.
    """
    # NOTE(review): this excerpt appears elided — guards and the final
    # info dict / "return" are missing from view.
    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
        self._downloader.trouble(u'ERROR: unable to find video information')
        video_url = unescapeHTML(m.group('url'))

        m = re.search(r"class='player_page_h1'>\s+<a.*?>(?P<title>.*?)</a>", webpage)
        self._downloader.trouble(u'Cannot find video title')
        title = unescapeHTML(m.group('title'))

        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        desc = unescapeHTML(m.group('desc'))
            'description': desc,
class TweetReelIE(InfoExtractor):
    """Information extractor for tweetreel.com.

    The video URL is derived from the page's status ID; description,
    uploader and upload date are scraped from the tweet markup.
    """
    # NOTE(review): this excerpt appears elided — "if m is None:" guards
    # and the info dict opening / "return" are missing from view.
    _VALID_URL = r'^(?:https?://)?(?:www\.)?tweetreel\.com/[?](?P<id>[0-9a-z]+)$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        m = re.search(r'<div id="left" status_id="([0-9]+)">', webpage)
        self._downloader.trouble(u'ERROR: Cannot find status ID')
        status_id = m.group(1)

        m = re.search(r'<div class="tweet_text">(.*?)</div>', webpage, flags=re.DOTALL)
        self._downloader.trouble(u'WARNING: Cannot find description')
        # Strip embedded anchors from the tweet text before unescaping.
        desc = unescapeHTML(re.sub('<a.*?</a>', '', m.group(1))).strip()

        m = re.search(r'<div class="tweet_info">.*?from <a target="_blank" href="https?://twitter.com/(?P<uploader_id>.+?)">(?P<uploader>.+?)</a>', webpage, flags=re.DOTALL)
        self._downloader.trouble(u'ERROR: Cannot find uploader')
        uploader = unescapeHTML(m.group('uploader'))
        uploader_id = unescapeHTML(m.group('uploader_id'))

        m = re.search(r'<span unixtime="([0-9]+)"', webpage)
        self._downloader.trouble(u'ERROR: Cannot find upload date')
        upload_date = datetime.datetime.fromtimestamp(int(m.group(1))).strftime('%Y%m%d')

        video_url = 'http://files.tweetreel.com/video/' + status_id + '.mov'
            'description': desc,
            'uploader': uploader,
            'uploader_id': uploader_id,
            'internal_id': status_id,
            'upload_date': upload_date
class SteamIE(InfoExtractor):
    """Information extractor for store.steampowered.com game trailers.

    A single game page can contain several movies; one info dict is
    emitted per (movie, title, thumbnail) triple found on the page.
    """
    # NOTE(review): this excerpt appears elided — parts of _VALID_URL, the
    # "videos = []" accumulator, per-video dict openings and the final
    # "return" are missing from view.
    _VALID_URL = r"""http://store.steampowered.com/
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL is written in verbose-regex mode.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        gameID = m.group('gameID')
        videourl = 'http://store.steampowered.com/video/%s/' % gameID
        webpage = self._download_webpage(videourl, gameID)
        # Movies, titles and thumbnails are matched independently and zipped.
        mweb = re.finditer(urlRE, webpage)
        namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
        titles = re.finditer(namesRE, webpage)
        thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
        thumbs = re.finditer(thumbsRE, webpage)
        for vid,vtitle,thumb in zip(mweb,titles,thumbs):
            video_id = vid.group('videoID')
            title = vtitle.group('videoName')
            video_url = vid.group('videoURL')
            video_thumb = thumb.group('thumbnail')
            self._downloader.trouble(u'ERROR: Cannot find video url for %s' % video_id)
                'title': unescapeHTML(title),
                'thumbnail': video_thumb
class UstreamIE(InfoExtractor):
    """Information extractor for ustream.tv recorded videos.

    The flv URL is derived directly from the recorded-video ID; title and
    uploader are scraped from data attributes on the page.
    """
    # NOTE(review): this excerpt appears elided — the info dict opening and
    # the final "return" are missing from view.
    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)
        m = re.search(r'data-title="(?P<title>.+)"',webpage)
        title = m.group('title')
        m = re.search(r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',webpage)
        uploader = m.group('uploader')
            'uploader': uploader
class RBMARadioIE(InfoExtractor):
    """Information extractor for rbmaradio.com shows.

    Show metadata is embedded as JSON in an inline <script>; the audio
    URL is the 'akamai_url' field with a fixed 256 kbps cbr parameter.
    """
    # NOTE(review): this excerpt appears elided — "try:" and the info dict
    # opening / "return" are missing from view.
    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')

        webpage = self._download_webpage(url, video_id)
        m = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
        raise ExtractorError(u'Cannot find metadata')
        json_data = m.group(1)

        data = json.loads(json_data)
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        video_url = data['akamai_url'] + '&cbr=256'
        url_parts = compat_urllib_parse_urlparse(video_url)
        video_ext = url_parts.path.rpartition('.')[2]
            'title': data['title'],
            'description': data.get('teaser_text'),
            'location': data.get('country_of_origin'),
            'uploader': data.get('host', {}).get('name'),
            'uploader_id': data.get('host', {}).get('slug'),
            'thumbnail': data.get('image', {}).get('large_url_2x'),
            'duration': data.get('duration'),
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    # NOTE(review): this excerpt appears elided — guards, the formats loop
    # opening, size/bitrate assignments and several "return" statements are
    # missing from view.
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s' % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        # Return the single format entry whose 'format' equals req_format.
            if(x["format"]==req_format):

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        video_id = mobj.group('videoid')

        # Age gate is bypassed by presetting the verification cookie.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get the video title
        result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
        raise ExtractorError(u'Unable to extract video title')
        video_title = result.group('title').strip()

        # Get the video date
        result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
        self._downloader.to_stderr(u'WARNING: unable to extract video date')
        upload_date = result.group('date').strip()

        # Get the video uploader
        result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
        self._downloader.to_stderr(u'WARNING: unable to extract uploader')
        video_uploader = None
        video_uploader = result.group('uploader').strip()
        video_uploader = clean_html( video_uploader )

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        result = re.search(DOWNLOAD_LIST_RE, webpage)
        raise ExtractorError(u'Unable to extract download list')
        download_list_html = result.group('download_list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if(len(links) == 0):
            raise ExtractorError(u'ERROR: no known formats available for video')

        self._downloader.to_screen(u'[youporn] Links found: %d' % len(links))

            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML( link )
            path = compat_urllib_parse_urlparse( video_url ).path
            extension = os.path.splitext( path )[1][1:]
            # Path segment 4 encodes "<size>_<bitrate>_..." for this format.
            format = path.split('/')[4].split('_')[:2]
            format = "-".join( format )
            title = u'%s-%s-%s' % (video_title, size, bitrate)

                'uploader': video_uploader,
                'upload_date': upload_date,
                'description': None,

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)

        req_format = self._downloader.params.get('format', None)
        self._downloader.to_screen(u'[youporn] Format: %s' % req_format)

        # Formats are assumed ordered best-first here (best = index 0).
        if req_format is None or req_format == 'best':
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            format = self._specific( req_format, formats )
            self._downloader.trouble(u'ERROR: requested format not available')
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    # NOTE(review): this excerpt appears elided — "if result is None:"
    # guards and part of the final info dict / "return" are missing from view.
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        video_id = mobj.group('videoid')
        # Title comes straight from the URL path, not the page.
        video_title = mobj.group('title')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
        result = re.search(VIDEO_URL_RE, webpage)
        self._downloader.trouble(u'ERROR: unable to extract video url')
        video_url = compat_urllib_parse.unquote(result.group('url'))

        #Get the uploaded date
        VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
        result = re.search(VIDEO_UPLOADED_RE, webpage)
        self._downloader.trouble(u'ERROR: unable to extract video title')
        upload_date = result.group('date')

        info = {'id': video_id,
                'upload_date': upload_date,
                'title': video_title,
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    # NOTE(review): this excerpt appears elided — "if result is None:"
    # guards and part of the final info dict / "return" are missing from view.
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        video_id = mobj.group('videoid')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video title
        result = re.search(r'<title>(?P<title>.*)</title>', webpage)
        raise ExtractorError(u'ERROR: unable to extract video title')
        video_title = result.group('title').strip()

        # Get the embed page
        result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        raise ExtractorError(u'ERROR: unable to extract embed page')

        embed_page_url = result.group(0).strip()
        # The embed page uses its own numeric id, distinct from the slug.
        video_id = result.group('videoid')

        webpage = self._download_webpage(embed_page_url, video_id)

        result = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
        raise ExtractorError(u'ERROR: unable to extract video url')
        video_url = result.group('source')

        info = {'id': video_id,
                'title': video_title,
                'player_url': embed_page_url}
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes.

    Reads the PAGE.mix JSON embedded in the playlist page, then walks the
    play/next API one track at a time until 'at_last_track' is set.
    """
    # NOTE(review): this excerpt appears elided — e.g. "mix_id = ...",
    # the per-track "info = {" opening, the result accumulator and the
    # final "return" are missing from view.
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        m = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
        raise ExtractorError(u'Cannot find trax information')
        json_like = m.group(1)
        data = json.loads(json_like)

        # Random session id — the API requires a per-player session token.
        session = str(random.randint(0, 1000000000))
        track_count = data['tracks_count']
        first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
        next_url = first_url

        for i in itertools.count():
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(i+1), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
            if api_data['set']['at_last_track']:
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
class KeekIE(InfoExtractor):
    """Information extractor for keek.com.

    Video and thumbnail URLs are derived directly from the keek id; title
    and uploader are scraped from the page.
    """
    # NOTE(review): this excerpt appears elided — the final info dict
    # opening / "return" are missing from view.
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')
        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
        webpage = self._download_webpage(url, video_id)
        m = re.search(r'<meta property="og:title" content="(?P<title>.+)"', webpage)
        title = unescapeHTML(m.group('title'))
        m = re.search(r'<div class="bio-names-and-report">[\s\n]+<h4>(?P<uploader>\w+)</h4>', webpage)
        uploader = unescapeHTML(m.group('uploader'))
            'thumbnail': thumbnail,
            'uploader': uploader
3982 class TEDIE(InfoExtractor):
3983 _VALID_URL=r'''http://www.ted.com/
3985 ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
3987 ((?P<type_talk>talks)) # We have a simple talk
3989 /(?P<name>\w+) # Here goes the name and then ".html"
3992 def suitable(self, url):
3993 """Receives a URL and returns True if suitable for this IE."""
3994 return re.match(self._VALID_URL, url, re.VERBOSE) is not None
3996 def _real_extract(self, url):
3997 m=re.match(self._VALID_URL, url, re.VERBOSE)
3998 if m.group('type_talk'):
3999 return [self._talk_info(url)]
4001 playlist_id=m.group('playlist_id')
4002 name=m.group('name')
4003 self._downloader.to_screen(u'[%s] Getting info of playlist %s: "%s"' % (self.IE_NAME,playlist_id,name))
4004 return self._playlist_videos_info(url,name,playlist_id)
4006 def _talk_video_link(self,mediaSlug):
4007 '''Returns the video link for that mediaSlug'''
4008 return 'http://download.ted.com/talks/%s.mp4' % mediaSlug
    def _playlist_videos_info(self,url,name,playlist_id=0):
        '''Returns the videos of the playlist'''
        # NOTE(review): the opening assignment of the verbose regex below
        # (presumably `video_RE=r'''...`) is missing from this excerpt; the
        # next three lines are the body of that pattern.
        <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
        ([.\s]*?)data-playlist_item_id="(\d+)"
        ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
        # Matches each talk's title link; group 'talk_url' is site-relative.
        video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
        m_names=re.finditer(video_name_RE,webpage)
        # NOTE(review): the initialization of `info` (presumably `info=[]`) is
        # missing from this excerpt.
        # Pair up each media match with its title match, in document order.
        for m_video, m_name in zip(m_videos,m_names):
            video_id=m_video.group('video_id')
            talk_url='http://www.ted.com%s' % m_name.group('talk_url')
            info.append(self._talk_info(talk_url,video_id))
        # NOTE(review): the return statement is missing from this excerpt.
    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        m=re.match(self._VALID_URL, url,re.VERBOSE)
        videoName=m.group('name')
        webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
        # If the url includes the language we get the title translated
        title_RE=r'<h1><span id="altHeadline" >(?P<title>.*)</span></h1>'
        title=re.search(title_RE, webpage).group('title')
        # Pulls the numeric id and the media slug out of the embedded
        # talkDetails JavaScript object.
        info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
            "id":(?P<videoID>[\d]+).*?
            "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
        thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"'
        thumb_match=re.search(thumb_RE,webpage)
        info_match=re.search(info_RE,webpage,re.VERBOSE)
        video_id=info_match.group('videoID')
        mediaSlug=info_match.group('mediaSlug')
        video_url=self._talk_video_link(mediaSlug)
        # NOTE(review): the opening of the returned info dict and its other
        # entries are missing from this excerpt.
        'thumbnail': thumb_match.group('thumbnail')
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de.

    Derives the video id from the URL path, fetches a per-video XML
    metadata document and reads download URL, title, format, description
    and thumbnail from it.
    """
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        # Endpoint returning the per-video XML metadata, keyed by numeric id.
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'
        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        # NOTE(review): the guard around this fallback (checking for an empty
        # last path component) is missing from this excerpt.
        _, video_id = os.path.split(url_parent_path)
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        # Parse from bytes so ElementTree honours the document's declared encoding.
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))
        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            self._downloader.trouble(u'ERROR: unable to extract download url')
            # NOTE(review): an early `return` after trouble() appears to be
            # missing from this excerpt.
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
        title = title_el.text
        # NOTE(review): `format` shadows the builtin; kept as-is here.
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            # NOTE(review): the body of this branch (error handling) is
            # missing from this excerpt.
        format = format_id_el.text
        description_el = metadata.find('description')
        if description_el is not None:
            description = description_el.text
        # NOTE(review): the `else` fallback assigning a default description is
        # missing from this excerpt.
        imagePreview_el = metadata.find('imagePreview')
        if imagePreview_el is not None:
            thumbnail = imagePreview_el.text
        # NOTE(review): the `else` fallback, the opening of the returned info
        # dict, its other entries and the return statement are missing from
        # this excerpt.
        'thumbnail': thumbnail,
        'description': description
4110 def gen_extractors():
4111 """ Return a list of an instance of every supported extractor.
4112 The order does matter; the first extractor matched is the one handling the URL.
4115 YoutubePlaylistIE(),
4139 StanfordOpenClassroomIE(),