2 # -*- coding: utf-8 -*-
4 from __future__ import absolute_import
15 import xml.etree.ElementTree
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    title:          Video title, unescaped.
    ext:            Video filename extension.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The .srt file contents.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    The fields should all be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self.set_downloader(downloader)

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(self._VALID_URL, url) is not None

        # NOTE(review): the `def working(self):` and `def initialize(self):`
        # headers for the two docstrings below appear to be missing from this
        # copy, leaving these statements orphaned (and unreachable after the
        # return above). Verify against upstream before relying on them.
        """Getter method for _WORKING."""
        """Initializes an instance (authentication, etc)."""
        self._real_initialize()

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        # NOTE(review): this return looks like the body of an elided IE_NAME
        # property (strips the trailing "IE" from the class name); it is
        # orphaned here. Verify against upstream.
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the response handle """
        # NOTE(review): the `if note is None:` / `if errnote is None:` guards
        # and the `try:` opener appear elided in this copy -- the bare
        # `except` below is a syntax error as written. Verify against
        # upstream before running.
        note = u'Downloading video webpage'
        self._downloader.to_screen(u'[%s] %s: %s' % (self.IE_NAME, video_id, note))
        return compat_urllib_request.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            errnote = u'Unable to download webpage'
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the data of the page as a string """
        urlh = self._request_webpage(url_or_request, video_id, note, errnote)
        webpage_bytes = urlh.read()
        # 'replace' keeps extraction going on badly-encoded pages instead of
        # raising UnicodeDecodeError.
        return webpage_bytes.decode('utf-8', 'replace')
class YoutubeIE(InfoExtractor):
    """Information extractor for youtube.com."""

    # NOTE(review): the `_VALID_URL = r'''...` opener (and closing quotes)
    # for this verbose regex appear elided in this copy; the pattern
    # fragments below are orphaned. Verify against upstream.
    (?:https?://)?                                       # http(s):// (optional)
    (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
    tube\.majestyc\.net/)                                # the various hostnames, with wildcard subdomains
    (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
    (?!view_play_list|my_playlists|artist|playlist)      # ignore playlist URLs
    (?:                                                  # the various things that can precede the ID:
        (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
        |(?:                                             # or the v= param in all its forms
            (?:watch(?:_popup)?(?:\.php)?)?              # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
            (?:\?|\#!?)                                  # the params delimiter ? or # or #!
            (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
        )?                                               # optional -> youtube.com/xxxx is OK
    )?                                                   # all until now is optional -> you can pass the naked ID
    ([0-9A-Za-z_-]+)                                     # here is it! the YouTube video ID
    (?(1).+)?                                            # if we found the ID, everything can follow
    _LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    _NETRC_MACHINE = 'youtube'
    # Listed in order of quality
    _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
    _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
    # NOTE(review): most entries of the two itag mappings below, and their
    # closing braces, appear elided in this copy; verify against upstream.
    _video_extensions = {
        '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
    _video_dimensions = {
191 def suitable(self, url):
192 """Receives a URL and returns True if suitable for this IE."""
193 return re.match(self._VALID_URL, url, re.VERBOSE) is not None
    def report_lang(self):
        """Report attempt to set language."""
        self._downloader.to_screen(u'[youtube] Setting language')

    def report_login(self):
        """Report attempt to log in."""
        self._downloader.to_screen(u'[youtube] Logging in')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[youtube] Confirming age')

    def report_video_webpage_download(self, video_id):
        """Report attempt to download video webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video webpage' % video_id)

    def report_video_info_webpage_download(self, video_id):
        """Report attempt to download video info webpage."""
        self._downloader.to_screen(u'[youtube] %s: Downloading video info webpage' % video_id)

    def report_video_subtitles_download(self, video_id):
        """Report attempt to download video subtitles."""
        # (docstring fixed: it was a copy-paste of the info-webpage reporter)
        self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)

    def report_information_extraction(self, video_id):
        """Report attempt to extract video information."""
        self._downloader.to_screen(u'[youtube] %s: Extracting video information' % video_id)

    def report_unavailable_format(self, video_id, format):
        """Report that the requested format is not available."""
        # (docstring fixed: it previously said "Report extracted video URL")
        self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))

    def report_rtmp_download(self):
        """Indicate the download will use the RTMP protocol."""
        self._downloader.to_screen(u'[youtube] RTMP download detected')
231 def _closed_captions_xml_to_srt(self, xml_string):
233 texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
234 # TODO parse xml instead of regex
235 for n, (start, dur_tag, dur, caption) in enumerate(texts):
236 if not dur: dur = '4'
238 end = start + float(dur)
239 start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
240 end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
241 caption = unescapeHTML(caption)
242 caption = unescapeHTML(caption) # double cycle, intentional
243 srt += str(n+1) + '\n'
244 srt += start + ' --> ' + end + '\n'
245 srt += caption + '\n\n'
    def _extract_subtitles(self, video_id):
        """Fetch closed captions for video_id.

        Returns a tuple (warning_message_or_None, srt_text_or_None).
        NOTE(review): several lines of this method appear elided in this copy
        (the `try:` openers, the `srt_lang = 'en'` / `else:` branch, part of
        the request params dict, and the `if not srt_xml:` guard); the bare
        `except` clauses below are syntax errors as written. Verify against
        upstream.
        """
        self.report_video_subtitles_download(video_id)
        request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
        srt_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
        # Build a language-code -> track-name map of available caption tracks.
        srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
        srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
        if not srt_lang_list:
            return (u'WARNING: video has no closed captions', None)
        # Language preference: explicit --sub-lang first.
        if self._downloader.params.get('subtitleslang', False):
            srt_lang = self._downloader.params.get('subtitleslang')
        elif 'en' in srt_lang_list:
            srt_lang = list(srt_lang_list.keys())[0]
        if not srt_lang in srt_lang_list:
            return (u'WARNING: no closed captions found in the specified language', None)
        params = compat_urllib_parse.urlencode({
            'name': srt_lang_list[srt_lang].encode('utf-8'),
        url = 'http://www.youtube.com/api/timedtext?' + params
        srt_xml = compat_urllib_request.urlopen(url).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
        return (u'WARNING: Did not fetch video subtitles', None)
        return (None, self._closed_captions_xml_to_srt(srt_xml))

    def _print_formats(self, formats):
        """Print each format as "itag : extension [resolution]"."""
        print('Available formats:')
        # NOTE(review): the `for x in formats:` loop header appears elided in
        # this copy; `x` is unbound as written. Verify against upstream.
        print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
    def _real_initialize(self):
        """Set interface language, log in (username/password or .netrc) and
        confirm age so age-gated videos are reachable.

        NOTE(review): many lines of this method appear elided in this copy
        (`return` bodies, `try:` openers, netrc credential unpacking, the
        `login_form_strs = {` and `age_form = {` dict openers and several of
        their fields); the bare `except` clauses below are syntax errors as
        written. Verify against upstream.
        """
        if self._downloader is None:
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            info = netrc.netrc().authenticators(self._NETRC_MACHINE)
            raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
        except (IOError, netrc.NetrcParseError) as err:
            self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))

        # Set language (best-effort: a failure only produces a warning).
        request = compat_urllib_request.Request(self._LANG_URL)
        compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to set language: %s' % compat_str(err))

        # No authentication to be performed
        request = compat_urllib_request.Request(self._LOGIN_URL)
        login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to fetch login page: %s' % compat_str(err))

        # Scrape the GALX and dsh hidden-input tokens the Google login form
        # requires.
        match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
        galx = match.group(1)
        match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)

        # NOTE(review): the `login_form_strs = {` opener and many fields
        # appear elided; the entries below are orphaned as written.
        u'continue': u'http://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
        u'PersistentCookie': u'yes',
        u'bgresponse': u'js_disabled',
        u'checkConnection': u'',
        u'checkedDomains': u'youtube',
        u'signIn': u'Sign in',
        u'service': u'youtube',

        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # handles byte strings, not unicode.
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        # Login failed if the response still contains the login form.
        if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
            self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))

        # Confirm age via the verify_age form.
        'action_confirm': 'Confirm',
        request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
        self.report_age_confirmation()
        age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))

    def _extract_id(self, url):
        """Extract the video id from a YouTube URL using _VALID_URL.

        NOTE(review): the `if mobj is None:` guard around the error call and
        the final `return video_id` appear elided in this copy; verify
        against upstream.
        """
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group(2)
    def _real_extract(self, url):
        """Extract metadata and downloadable format URLs for a YouTube video.

        NOTE(review): numerous lines of this method appear elided in this
        copy (`try:` openers, `if mobj is not None:` guards, `return`
        statements, `break` in the el_type loop, `results = []` and the
        `results.append({` wrapper); the bare `except` clauses below are
        syntax errors as written. Verify against upstream.
        """
        # Extract original video URL from URL with redirection, like age verification, using next_url parameter
        mobj = re.search(self._NEXT_URL_RE, url)
        url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
        video_id = self._extract_id(url)

        # Get video webpage
        self.report_video_webpage_download(video_id)
        url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
        request = compat_urllib_request.Request(url)
        video_webpage_bytes = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))

        video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')

        # Attempt to extract SWF player URL
        mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
        # Un-escape the JS-escaped URL (\\/ -> /, etc.).
        player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))

        # Get video info: try several &el= variants until one yields a token.
        self.report_video_info_webpage_download(video_id)
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                    % (video_id, el_type))
            request = compat_urllib_request.Request(video_info_url)
            video_info_webpage_bytes = compat_urllib_request.urlopen(request).read()
            video_info_webpage = video_info_webpage_bytes.decode('utf-8', 'ignore')
            video_info = compat_parse_qs(video_info_webpage)
            if 'token' in video_info:
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))

        if 'token' not in video_info:
            if 'reason' in video_info:
                self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0])
            self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')

        # Check for "rental" videos
        if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
            self._downloader.trouble(u'ERROR: "rental" videos not supported')

        # Start extracting information
        self.report_information_extraction(video_id)

        # uploader
        if 'author' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract uploader name')
        video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

        # uploader_id (best-effort; scraped from the watch page markup)
        video_uploader_id = None
        mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
        video_uploader_id = mobj.group(1)
        self._downloader.trouble(u'WARNING: unable to extract uploader nickname')

        # title
        if 'title' not in video_info:
            self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])

        # thumbnail
        if 'thumbnail_url' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
        else: # don't panic if we can't find it
            video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

        # upload date: normalize separators, then try several date formats.
        mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
        upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
        format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
        for expression in format_expressions:
            upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')

        # description (optional; empty string if absent)
        video_description = get_element_by_id("eow-description", video_webpage)
        if video_description:
            video_description = clean_html(video_description)
        video_description = ''

        # closed captions (only when --write-srt was requested)
        video_subtitles = None
        if self._downloader.params.get('writesubtitles', False):
            (srt_error, video_subtitles) = self._extract_subtitles(video_id)
            self._downloader.trouble(srt_error)

        if 'length_seconds' not in video_info:
            self._downloader.trouble(u'WARNING: unable to extract video duration')
        video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])

        # token
        video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])

        # Decide which formats to download
        req_format = self._downloader.params.get('format', None)

        if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
            self.report_rtmp_download()
            video_url_list = [(None, video_info['conn'][0])]
        elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
            # Parse the comma-separated, urlencoded per-format stream map.
            url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
            url_data = [compat_parse_qs(uds) for uds in url_data_strs]
            url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
            url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)

            format_limit = self._downloader.params.get('format_limit', None)
            available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
            if format_limit is not None and format_limit in available_formats:
                format_list = available_formats[available_formats.index(format_limit):]
                format_list = available_formats
            existing_formats = [x for x in format_list if x in url_map]
            if len(existing_formats) == 0:
                self._downloader.trouble(u'ERROR: no known formats available for video')
            if self._downloader.params.get('listformats', None):
                self._print_formats(existing_formats)
            if req_format is None or req_format == 'best':
                video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
            elif req_format == 'worst':
                video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
            elif req_format in ('-1', 'all'):
                video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
                # Specific formats. We pick the first in a slash-delimited sequence.
                # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
                req_formats = req_format.split('/')
                video_url_list = None
                for rf in req_formats:
                    video_url_list = [(rf, url_map[rf])]
                if video_url_list is None:
                    self._downloader.trouble(u'ERROR: requested format not available')
            self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')

        for format_param, video_real_url in video_url_list:
            # Extension
            video_extension = self._video_extensions.get(format_param, 'flv')

            video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
                                              self._video_dimensions.get(format_param, '???'))

            # NOTE(review): the `results.append({` opener, the 'id' entry and
            # the closing lines (plus the final `return results`) appear
            # elided in this copy.
            'url': video_real_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date': upload_date,
            'title': video_title,
            'ext': video_extension,
            'format': video_format,
            'thumbnail': video_thumbnail,
            'description': video_description,
            'player_url': player_url,
            'subtitles': video_subtitles,
            'duration': video_duration
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    IE_NAME = u'metacafe'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self._downloader.to_screen(u'[metacafe] Retrieving disclaimer')

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self._downloader.to_screen(u'[metacafe] Confirming age')

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[metacafe] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[metacafe] %s: Extracting information' % video_id)

    def _real_initialize(self):
        """Retrieve the family-filter disclaimer page and post the form that
        disables filtering.

        NOTE(review): the `try:` openers and the `disclaimer_form = {` opener
        appear elided in this copy; the bare `except` clauses below are
        syntax errors as written. Verify against upstream.
        """
        # Retrieve disclaimer
        request = compat_urllib_request.Request(self._DISCLAIMER)
        self.report_disclaimer()
        disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err))

        # Confirm age (form fields partially elided)
        'submit': "Continue - I'm over 18",
        request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
        self.report_age_confirmation()
        disclaimer = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))

    def _real_extract(self, url):
        """Extract the media URL, title and uploader of a Metacafe video.

        NOTE(review): `if mobj is None:` guards, `try:` openers, `return`
        statements and the `return [{` wrapper appear elided in this copy;
        bare `except` clauses are syntax errors as written. Verify against
        upstream.
        """
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            # Delegate yt-prefixed ids to the YouTube extractor.
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
        self.report_download_webpage(video_id)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))
        video_extension = mediaURL[-3:]

        # Extract gdaKey if available
        mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
        gdaKey = mobj.group(1)
        video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)

        # Fallback: pull the media URL out of the flashvars blob.
        mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
        self._downloader.trouble(u'ERROR: unable to extract media URL')
        vardict = compat_parse_qs(mobj.group(1))
        if 'mediaData' not in vardict:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
        mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
        self._downloader.trouble(u'ERROR: unable to extract media URL')
        mediaURL = mobj.group(1).replace('\\/', '/')
        video_extension = mediaURL[-3:]
        video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'submitter=(.*?);', webpage)
        self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
        video_uploader = mobj.group(1)

        # NOTE(review): the `return [{` opener and closing lines appear
        # elided; the entries below are orphaned as written.
        'id': video_id.decode('utf-8'),
        'url': video_url.decode('utf-8'),
        'uploader': video_uploader.decode('utf-8'),
        'title': video_title,
        'ext': video_extension.decode('utf-8'),
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the best-quality media URL and metadata for a Dailymotion
        video.

        NOTE(review): `if mobj is None:` guards, the quality-probe loop body
        (`if key in flashvars:` / `max_quality = key` / `break`), `return`
        statements and the `return [{` wrapper appear elided in this copy.
        Verify against upstream.
        """
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        video_id = mobj.group(1).split('_')[0].split('?')[0]

        video_extension = 'mp4'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        # Disable the family filter so age-gated videos are reachable.
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'\s*var flashvars = (.*)', webpage)
        self._downloader.trouble(u'ERROR: unable to extract media URL')
        flashvars = compat_urllib_parse.unquote(mobj.group(1))

        # Probe qualities best-first; the first key present wins.
        for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
            self._downloader.to_screen(u'[dailymotion] Using %s' % key)
        self._downloader.trouble(u'ERROR: unable to extract video URL')

        mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        self._downloader.trouble(u'ERROR: unable to extract video URL')

        # Un-escape the JSON-escaped URL.
        video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = unescapeHTML(mobj.group('title'))

        video_uploader = None
        mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
        # looking for official user
        mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
        if mobj_official is None:
            self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
        video_uploader = mobj_official.group(1)
        video_uploader = mobj.group(1)

        video_upload_date = None
        mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        # Reorder DD-MM-YYYY into YYYYMMDD.
        video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)

        # NOTE(review): the `return [{` opener, 'id'/'url' entries and the
        # closing lines appear elided; the entries below are orphaned.
        'uploader': video_uploader,
        'upload_date': video_upload_date,
        'title': video_title,
        'ext': video_extension,
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    IE_NAME = u'photobucket'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[photobucket] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[photobucket] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the flv media URL, title and uploader of a Photobucket
        video.

        NOTE(review): `if mobj is None:` guards, `try:` openers, the
        mediaURL -> video_url assignment and the `return [{` wrapper appear
        elided in this copy; the bare `except` clause below is a syntax error
        as written. Verify against upstream.
        """
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        video_id = mobj.group(1)

        video_extension = 'flv'

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        self.report_download_webpage(video_id)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract media URL')
        mediaURL = compat_urllib_parse.unquote(mobj.group(1))

        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract title')
        video_title = mobj.group(1).decode('utf-8')

        video_uploader = mobj.group(2).decode('utf-8')

        # NOTE(review): the `return [{` opener and closing lines appear
        # elided; the entries below are orphaned as written.
        'id': video_id.decode('utf-8'),
        'url': video_url.decode('utf-8'),
        'uploader': video_uploader,
        'title': video_title,
        'ext': video_extension.decode('utf-8'),
class YahooIE(InfoExtractor):
    """Information extractor for video.yahoo.com."""

    # _VALID_URL matches all Yahoo! Video URLs
    # _VPAGE_URL matches only the extractable '/watch/' URLs
    _VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
    _VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
    IE_NAME = u'video.yahoo'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[video.yahoo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[video.yahoo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        """Extract metadata and the media URL for a Yahoo! Video page.

        Non-/watch/ URLs are rewritten to the canonical /watch/ form and the
        method recurses once (new_video=False).
        NOTE(review): `if mobj is None:` guards, `try:` openers and `return`
        statements appear elided in this copy; the bare `except` clauses
        below are syntax errors as written. Verify against upstream.
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        video_id = mobj.group(2)
        video_extension = 'flv'

        # Rewrite valid but non-extractable URLs as
        # extractable English language /watch/ URLs
        if re.match(self._VPAGE_URL, url) is None:
            request = compat_urllib_request.Request(url)
            webpage = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

            mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
            self._downloader.trouble(u'ERROR: Unable to extract id field')
            yahoo_id = mobj.group(1)

            mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
            self._downloader.trouble(u'ERROR: Unable to extract vid field')
            yahoo_vid = mobj.group(1)

            # Recurse once with the canonical /watch/ URL.
            url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
            return self._real_extract(url, new_video=False)

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url)
        self.report_download_webpage(video_id)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

        # Extract uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video title')
        video_title = mobj.group(1).decode('utf-8')

        mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video uploader')
        # NOTE(review): group(1) captures the people|profile alternative, not
        # the uploader name in group(2) -- looks suspicious; verify.
        video_uploader = mobj.group(1).decode('utf-8')

        # Extract video thumbnail
        mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
        video_thumbnail = mobj.group(1).decode('utf-8')

        # Extract video description
        mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video description')
        video_description = mobj.group(1).decode('utf-8')
        if not video_description:
            video_description = 'No description available.'

        # Extract video height and width
        mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video height')
        yv_video_height = mobj.group(1)

        mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
        self._downloader.trouble(u'ERROR: unable to extract video width')
        yv_video_width = mobj.group(1)

        # Retrieve video playlist to extract media URL
        # I'm not completely sure what all these options are, but we
        # seem to need most of them, otherwise the server sends a 401.
        yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
        yv_bitrate = '700' # according to Wikipedia this is hard-coded
        request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
                '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
                '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
        self.report_download_webpage(video_id)
        webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

        # Extract media URL from playlist XML
        mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
        self._downloader.trouble(u'ERROR: Unable to extract media URL')
        video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
        video_url = unescapeHTML(video_url)

        # NOTE(review): the `return [{` opener, the 'url' entry and the
        # closing lines appear elided; the entries below are orphaned.
        'id': video_id.decode('utf-8'),
        'uploader': video_uploader,
        'upload_date': None,
        'title': video_title,
        'ext': video_extension.decode('utf-8'),
        'thumbnail': video_thumbnail.decode('utf-8'),
        'description': video_description,
1013 class VimeoIE(InfoExtractor):
1014 """Information extractor for vimeo.com."""
# NOTE(review): this source view appears decimated -- `try:`/`return` lines and
# `if mobj is None:` guards seem to be missing between the visible statements;
# the comments below describe only what the visible code shows.
1016 # _VALID_URL matches Vimeo URLs
1017 _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
1020 def __init__(self, downloader=None):
1021 InfoExtractor.__init__(self, downloader)
# Progress-reporting helpers: write status lines through the FileDownloader.
1023 def report_download_webpage(self, video_id):
1024 """Report webpage download."""
1025 self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)
1027 def report_extraction(self, video_id):
1028 """Report information extraction."""
1029 self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)
# Main entry point: parse the video id from the URL, download the watch page,
# slice out the embedded player config JSON, and build the info dictionary.
1031 def _real_extract(self, url, new_video=True):
1032 # Extract ID from URL
1033 mobj = re.match(self._VALID_URL, url)
1035 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1038 video_id = mobj.group('id')
# Normalize the URL: force https when no scheme was given, and rewrite
# direct player links to the canonical watch page.
1039 if not mobj.group('proto'):
1040 url = 'https://' + url
1041 if mobj.group('direct_link'):
1042 url = 'https://vimeo.com/' + video_id
1044 # Retrieve video webpage to extract further information
1045 request = compat_urllib_request.Request(url, None, std_headers)
1047 self.report_download_webpage(video_id)
1048 webpage_bytes = compat_urllib_request.urlopen(request).read()
1049 webpage = webpage_bytes.decode('utf-8')
1050 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1051 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1054 # Now we begin extracting as much information as we can from what we
1055 # retrieved. First we extract the information common to all extractors,
1056 # and latter we extract those that are Vimeo specific.
1057 self.report_extraction(video_id)
1059 # Extract the config JSON
# Brittle by design: slice the JSON object out of the inline player script
# between ' = {config:' and ',assets:' instead of parsing the HTML.
1061 config = webpage.split(' = {config:')[1].split(',assets:')[0]
1062 config = json.loads(config)
1064 self._downloader.trouble(u'ERROR: unable to extract info section')
1068 video_title = config["video"]["title"]
1070 # Extract uploader and uploader_id
# uploader_id is the last path component of the owner's profile URL.
1071 video_uploader = config["video"]["owner"]["name"]
1072 video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]
1074 # Extract video thumbnail
1075 video_thumbnail = config["video"]["thumbnail"]
1077 # Extract video description
1078 video_description = get_element_by_attribute("itemprop", "description", webpage)
1079 if video_description: video_description = clean_html(video_description)
1080 else: video_description = ''
1082 # Extract upload date
# Collapse the ISO date prefix (YYYY-MM-DDT...) into the YYYYMMDD form the
# info dictionary expects.
1083 video_upload_date = None
1084 mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
1085 if mobj is not None:
1086 video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)
1088 # Vimeo specific: extract request signature and timestamp
# sig/timestamp become required query parameters of play_redirect below.
1089 sig = config['request']['signature']
1090 timestamp = config['request']['timestamp']
1092 # Vimeo specific: extract video codec and quality information
1093 # First consider quality, then codecs, then take everything
1094 # TODO bind to format param
# Candidate codecs in preference order: (name in config JSON, file extension).
1095 codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
1096 files = { 'hd': [], 'sd': [], 'other': []}
1097 for codec_name, codec_extension in codecs:
1098 if codec_name in config["video"]["files"]:
1099 if 'hd' in config["video"]["files"][codec_name]:
1100 files['hd'].append((codec_name, codec_extension, 'hd'))
1101 elif 'sd' in config["video"]["files"][codec_name]:
1102 files['sd'].append((codec_name, codec_extension, 'sd'))
1104 files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))
# Pick the best available quality bucket; the first matching entry wins.
1106 for quality in ('hd', 'sd', 'other'):
1107 if len(files[quality]) > 0:
1108 video_quality = files[quality][0][2]
1109 video_codec = files[quality][0][0]
1110 video_extension = files[quality][0][1]
1111 self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
1114 self._downloader.trouble(u'ERROR: no known codec found')
1117 video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
1118 %(video_id, sig, timestamp, video_quality, video_codec.upper())
1123 'uploader': video_uploader,
1124 'uploader_id': video_uploader_id,
1125 'upload_date': video_upload_date,
1126 'title': video_title,
1127 'ext': video_extension,
1128 'thumbnail': video_thumbnail,
1129 'description': video_description,
1133 class ArteTvIE(InfoExtractor):
1134 """arte.tv information extractor."""
# NOTE(review): interior lines (try:, return, guards) appear to be missing from
# this view; the comments below describe only the visible code.
1136 _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
# URLs whose last path component matches index-<digits>.html are treated as
# live streams (see _real_extract).
1137 _LIVE_URL = r'index-[0-9]+\.html$'
1139 IE_NAME = u'arte.tv'
1141 def __init__(self, downloader=None):
1142 InfoExtractor.__init__(self, downloader)
1144 def report_download_webpage(self, video_id):
1145 """Report webpage download."""
1146 self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)
1148 def report_extraction(self, video_id):
1149 """Report information extraction."""
1150 self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)
# Download a page, reporting progress and funnelling network/URL errors
# through the downloader's trouble() handler.
1152 def fetch_webpage(self, url):
1153 request = compat_urllib_request.Request(url)
1155 self.report_download_webpage(url)
1156 webpage = compat_urllib_request.urlopen(request).read()
1157 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1158 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1160 except ValueError as err:
1161 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
# Fetch `url`, apply `regex` (with `regexFlags`), and collect capture groups
# described by matchTuples: (group index, info key, error message) triples.
1165 def grep_webpage(self, url, regex, regexFlags, matchTuples):
1166 page = self.fetch_webpage(url)
1167 mobj = re.search(regex, page, regexFlags)
1171 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1174 for (i, key, err) in matchTuples:
1175 if mobj.group(i) is None:
1176 self._downloader.trouble(err)
1179 info[key] = mobj.group(i)
# Live-stream variant: locate the videothek JS, then scrape the swf player
# URL and the stream path for the viewer's language.
1183 def extractLiveStream(self, url):
# Language code is the 4th-from-last path component of the live URL.
1184 video_lang = url.split('/')[-4]
1185 info = self.grep_webpage(
1187 r'src="(.*?/videothek_js.*?\.js)',
1190 (1, 'url', u'ERROR: Invalid URL: %s' % url)
1193 http_host = url.split('/')[2]
1194 next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
1195 info = self.grep_webpage(
1197 r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
1198 '(http://.*?\.swf).*?' +
1202 (1, 'path', u'ERROR: could not extract video path: %s' % url),
1203 (2, 'player', u'ERROR: could not extract video player: %s' % url),
1204 (3, 'url', u'ERROR: could not extract video url: %s' % url)
1207 video_url = u'%s/%s' % (info.get('url'), info.get('path'))
# "Plus 7" (catch-up) variant: follow the videorefFileUrl chain down to the
# language-specific <video> XML and take the HD-quality URL.
1209 def extractPlus7Stream(self, url):
1210 video_lang = url.split('/')[-3]
1211 info = self.grep_webpage(
1213 r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
1216 (1, 'url', u'ERROR: Invalid URL: %s' % url)
1219 next_url = compat_urllib_parse.unquote(info.get('url'))
1220 info = self.grep_webpage(
1222 r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
1225 (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
1228 next_url = compat_urllib_parse.unquote(info.get('url'))
1230 info = self.grep_webpage(
1232 r'<video id="(.*?)".*?>.*?' +
1233 '<name>(.*?)</name>.*?' +
1234 '<dateVideo>(.*?)</dateVideo>.*?' +
1235 '<url quality="hd">(.*?)</url>',
1238 (1, 'id', u'ERROR: could not extract video id: %s' % url),
1239 (2, 'title', u'ERROR: could not extract video title: %s' % url),
1240 (3, 'date', u'ERROR: could not extract video date: %s' % url),
1241 (4, 'url', u'ERROR: could not extract video url: %s' % url)
1246 'id': info.get('id'),
1247 'url': compat_urllib_parse.unquote(info.get('url')),
1248 'uploader': u'arte.tv',
1249 'upload_date': info.get('date'),
1250 'title': info.get('title').decode('utf-8'),
# Dispatch: URLs matching _LIVE_URL go to extractLiveStream, everything
# else to extractPlus7Stream.
1256 def _real_extract(self, url):
1257 video_id = url.split('/')[-1]
1258 self.report_extraction(video_id)
1260 if re.search(self._LIVE_URL, video_id) is not None:
1261 self.extractLiveStream(url)
1264 info = self.extractPlus7Stream(url)
1269 class GenericIE(InfoExtractor):
1270 """Generic last-resort information extractor."""
# NOTE(review): this view appears decimated (try:/return/guard lines missing);
# the comments below describe only the visible code.
1273 IE_NAME = u'generic'
1275 def __init__(self, downloader=None):
1276 InfoExtractor.__init__(self, downloader)
1278 def report_download_webpage(self, video_id):
1279 """Report webpage download."""
1280 self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
1281 self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)
1283 def report_extraction(self, video_id):
1284 """Report information extraction."""
1285 self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)
1287 def report_following_redirect(self, new_url):
1288 """Report information extraction."""
1289 self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
1291 def _test_redirect(self, url):
1292 """Check if it is a redirect, like url shorteners, in case restart chain."""
# Probe with a HEAD request so redirects can be resolved without
# downloading any response body.
1293 class HeadRequest(compat_urllib_request.Request):
1294 def get_method(self):
1297 class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
1299 Subclass the HTTPRedirectHandler to make it use our
1300 HeadRequest also on the redirected URL
1302 def redirect_request(self, req, fp, code, msg, headers, newurl):
1303 if code in (301, 302, 303, 307):
1304 newurl = newurl.replace(' ', '%20')
# Drop body-related headers: a HEAD request carries no payload.
1305 newheaders = dict((k,v) for k,v in req.headers.items()
1306 if k.lower() not in ("content-length", "content-type"))
1307 return HeadRequest(newurl,
1309 origin_req_host=req.get_origin_req_host(),
1312 raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)
1314 class HTTPMethodFallback(compat_urllib_request.BaseHandler):
1316 Fallback to GET if HEAD is not allowed (405 HTTP error)
1318 def http_error_405(self, req, fp, code, msg, headers):
1322 newheaders = dict((k,v) for k,v in req.headers.items()
1323 if k.lower() not in ("content-length", "content-type"))
1324 return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
1326 origin_req_host=req.get_origin_req_host(),
# Build a bare opener containing only the handlers needed for the probe.
1330 opener = compat_urllib_request.OpenerDirector()
1331 for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
1332 HTTPMethodFallback, HEADRedirectHandler,
1333 compat_urllib_error.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
1334 opener.add_handler(handler())
1336 response = opener.open(HeadRequest(url))
1337 new_url = response.geturl()
# Restart the extraction chain on the resolved URL.
1342 self.report_following_redirect(new_url)
1343 self._downloader.download([new_url])
1346 def _real_extract(self, url):
1347 if self._test_redirect(url): return
1349 video_id = url.split('/')[-1]
1350 request = compat_urllib_request.Request(url)
1352 self.report_download_webpage(video_id)
1353 webpage = compat_urllib_request.urlopen(request).read()
1354 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1355 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
1357 except ValueError as err:
1358 # since this is the last-resort InfoExtractor, if
1359 # this error is thrown, it'll be thrown here
1360 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1363 self.report_extraction(video_id)
1364 # Start with something easy: JW Player in SWFObject
1365 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
1367 # Broaden the search a little bit
1368 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
1370 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1373 # It's possible that one of the regexes
1374 # matched, but returned an empty group:
1375 if mobj.group(1) is None:
1376 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
1379 video_url = compat_urllib_parse.unquote(mobj.group(1))
1380 video_id = os.path.basename(video_url)
1382 # here's a fun little line of code for you:
# Derive the extension and id from the URL's basename.
1383 video_extension = os.path.splitext(video_id)[1][1:]
1384 video_id = os.path.splitext(video_id)[0]
1386 # it's tempting to parse this further, but you would
1387 # have to take into account all the variations like
1388 # Video Title - Site Name
1389 # Site Name | Video Title
1390 # Video Title - Tagline | Site Name
1391 # and so on and so forth; it's just not practical
1392 mobj = re.search(r'<title>(.*)</title>', webpage)
1394 self._downloader.trouble(u'ERROR: unable to extract title')
1396 video_title = mobj.group(1)
1398 # video uploader is domain name
1399 mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
1401 self._downloader.trouble(u'ERROR: unable to extract title')
1403 video_uploader = mobj.group(1)
1408 'uploader': video_uploader,
1409 'upload_date': None,
1410 'title': video_title,
1411 'ext': video_extension,
1415 class YoutubeSearchIE(InfoExtractor):
1416 """Information Extractor for YouTube search queries."""
# Queries look like "ytsearch<N>:<terms>"; the optional prefix selects how
# many results to download ("all" caps at _max_youtube_results).
1417 _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
1418 _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
1419 _max_youtube_results = 1000
1420 IE_NAME = u'youtube:search'
1422 def __init__(self, downloader=None):
1423 InfoExtractor.__init__(self, downloader)
1425 def report_download_page(self, query, pagenum):
1426 """Report attempt to download search page with given number."""
1427 query = query.decode(preferredencoding())
1428 self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
# Parse the prefix into a result count, then delegate to _download_n_results.
1430 def _real_extract(self, query):
1431 mobj = re.match(self._VALID_URL, query)
1433 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1436 prefix, query = query.split(':')
1438 query = query.encode('utf-8')
1440 self._download_n_results(query, 1)
1442 elif prefix == 'all':
1443 self._download_n_results(query, self._max_youtube_results)
1449 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1451 elif n > self._max_youtube_results:
1452 self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
1453 n = self._max_youtube_results
1454 self._download_n_results(query, n)
1456 except ValueError: # parsing prefix as integer fails
1457 self._download_n_results(query, 1)
1460 def _download_n_results(self, query, n):
1461 """Downloads a specified number of results for a query"""
# Page through the GData API 50 results at a time until `limit` ids are
# collected, then queue each watch URL with the downloader.
1467 while (50 * pagenum) < limit:
1468 self.report_download_page(query, pagenum+1)
1469 result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
1470 request = compat_urllib_request.Request(result_url)
1472 data = compat_urllib_request.urlopen(request).read()
1473 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1474 self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err))
1476 api_response = json.loads(data)['data']
1478 new_ids = list(video['id'] for video in api_response['items'])
1479 video_ids += new_ids
# Never ask for more than the API reports as available.
1481 limit = min(n, api_response['totalItems'])
1484 if len(video_ids) > n:
1485 video_ids = video_ids[:n]
1486 for id in video_ids:
1487 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1491 class GoogleSearchIE(InfoExtractor):
1492 """Information Extractor for Google Video search queries."""
# Queries look like "gvsearch<N>:<terms>"; same prefix convention as the
# other search IEs in this file.
1493 _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
1494 _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
1495 _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
# Presence of the "next page" element means more results are available.
1496 _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
1497 _max_google_results = 1000
1498 IE_NAME = u'video.google:search'
1500 def __init__(self, downloader=None):
1501 InfoExtractor.__init__(self, downloader)
1503 def report_download_page(self, query, pagenum):
1504 """Report attempt to download playlist page with given number."""
1505 query = query.decode(preferredencoding())
1506 self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))
# Parse the prefix into a result count, then delegate to _download_n_results.
1508 def _real_extract(self, query):
1509 mobj = re.match(self._VALID_URL, query)
1511 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1514 prefix, query = query.split(':')
1516 query = query.encode('utf-8')
1518 self._download_n_results(query, 1)
1520 elif prefix == 'all':
1521 self._download_n_results(query, self._max_google_results)
1527 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1529 elif n > self._max_google_results:
1530 self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
1531 n = self._max_google_results
1532 self._download_n_results(query, n)
1534 except ValueError: # parsing prefix as integer fails
1535 self._download_n_results(query, 1)
1538 def _download_n_results(self, query, n):
1539 """Downloads a specified number of results for a query"""
# Scrape result pages (10 results per page index step), harvesting docids
# with _VIDEO_INDICATOR, until n ids are found or no "next" link remains.
1545 self.report_download_page(query, pagenum)
1546 result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
1547 request = compat_urllib_request.Request(result_url)
1549 page = compat_urllib_request.urlopen(request).read()
1550 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1551 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1554 # Extract video identifiers
1555 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1556 video_id = mobj.group(1)
1557 if video_id not in video_ids:
1558 video_ids.append(video_id)
1559 if len(video_ids) == n:
1560 # Specified n videos reached
1561 for id in video_ids:
1562 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1565 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
# No more result pages: download everything collected so far.
1566 for id in video_ids:
1567 self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
1570 pagenum = pagenum + 1
1573 class YahooSearchIE(InfoExtractor):
1574 """Information Extractor for Yahoo! Video search queries."""
# Queries look like "yvsearch<N>:<terms>"; same prefix convention as the
# other search IEs in this file.
1577 _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
1578 _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
1579 _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
1580 _MORE_PAGES_INDICATOR = r'\s*Next'
1581 _max_yahoo_results = 1000
1582 IE_NAME = u'video.yahoo:search'
1584 def __init__(self, downloader=None):
1585 InfoExtractor.__init__(self, downloader)
1587 def report_download_page(self, query, pagenum):
1588 """Report attempt to download playlist page with given number."""
1589 query = query.decode(preferredencoding())
1590 self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))
# Parse the prefix into a result count, then delegate to _download_n_results.
1592 def _real_extract(self, query):
1593 mobj = re.match(self._VALID_URL, query)
1595 self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
1598 prefix, query = query.split(':')
1600 query = query.encode('utf-8')
1602 self._download_n_results(query, 1)
1604 elif prefix == 'all':
1605 self._download_n_results(query, self._max_yahoo_results)
1611 self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
1613 elif n > self._max_yahoo_results:
1614 self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
1615 n = self._max_yahoo_results
1616 self._download_n_results(query, n)
1618 except ValueError: # parsing prefix as integer fails
1619 self._download_n_results(query, 1)
1622 def _download_n_results(self, query, n):
1623 """Downloads a specified number of results for a query"""
# already_seen deduplicates ids across pages; video_ids preserves order.
1626 already_seen = set()
1630 self.report_download_page(query, pagenum)
1631 result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
1632 request = compat_urllib_request.Request(result_url)
1634 page = compat_urllib_request.urlopen(request).read()
1635 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1636 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1639 # Extract video identifiers
1640 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1641 video_id = mobj.group(1)
1642 if video_id not in already_seen:
1643 video_ids.append(video_id)
1644 already_seen.add(video_id)
1645 if len(video_ids) == n:
1646 # Specified n videos reached
1647 for id in video_ids:
1648 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1651 if re.search(self._MORE_PAGES_INDICATOR, page) is None:
# No more result pages: download everything collected so far.
1652 for id in video_ids:
1653 self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
1656 pagenum = pagenum + 1
1659 class YoutubePlaylistIE(InfoExtractor):
1660 """Information Extractor for YouTube playlists."""
# _VALID_URL accepts playlist/course/artist/user URLs as well as bare
# PL.../EC... ids; group(1) is the list-type key, group(2) the playlist id,
# group(3) an optional single-video part.
1662 _VALID_URL = r'(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL|EC)?|PL|EC)([0-9A-Za-z-_]{10,})(?:/.*?/([0-9A-Za-z_-]+))?.*'
1663 _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
1664 _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&([^&"]+&)*list=.*?%s'
1665 _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
1666 IE_NAME = u'youtube:playlist'
1668 def __init__(self, downloader=None):
1669 InfoExtractor.__init__(self, downloader)
1671 def report_download_page(self, playlist_id, pagenum):
1672 """Report attempt to download playlist page with given number."""
1673 self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
1675 def _real_extract(self, url):
1676 # Extract playlist id
1677 mobj = re.match(self._VALID_URL, url)
1679 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
# A single-video reference inside the playlist URL short-circuits to a
# plain video download.
1683 if mobj.group(3) is not None:
1684 self._downloader.download([mobj.group(3)])
1687 # Download playlist pages
1688 # prefix is 'p' as default for playlists but there are other types that need extra care
1689 playlist_prefix = mobj.group(1)
1690 if playlist_prefix == 'a':
1691 playlist_access = 'artist'
1693 playlist_prefix = 'p'
1694 playlist_access = 'view_play_list'
1695 playlist_id = mobj.group(2)
1700 self.report_download_page(playlist_id, pagenum)
1701 url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
1702 request = compat_urllib_request.Request(url)
1704 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1705 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1706 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1709 # Extract video identifiers
1711 for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
1712 if mobj.group(1) not in ids_in_page:
1713 ids_in_page.append(mobj.group(1))
1714 video_ids.extend(ids_in_page)
# Stop paging when the "Next" marker disappears from the page.
1716 if self._MORE_PAGES_INDICATOR not in page:
1718 pagenum = pagenum + 1
1720 total = len(video_ids)
# Apply --playlist-start/--playlist-end slicing (1-based start).
1722 playliststart = self._downloader.params.get('playliststart', 1) - 1
1723 playlistend = self._downloader.params.get('playlistend', -1)
1724 if playlistend == -1:
1725 video_ids = video_ids[playliststart:]
1727 video_ids = video_ids[playliststart:playlistend]
1729 if len(video_ids) == total:
1730 self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total))
1732 self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(video_ids)))
1734 for id in video_ids:
1735 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1739 class YoutubeChannelIE(InfoExtractor):
1740 """Information Extractor for YouTube channels."""
1742 _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
1743 _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
# Presence of the "Next »" marker means more channel pages remain.
1744 _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
1745 IE_NAME = u'youtube:channel'
1747 def report_download_page(self, channel_id, pagenum):
1748 """Report attempt to download channel page with given number."""
1749 self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))
# Page through the channel's video list, collect unique watch ids, and
# queue each one with the downloader.
1751 def _real_extract(self, url):
1752 # Extract channel id
1753 mobj = re.match(self._VALID_URL, url)
1755 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1758 # Download channel pages
1759 channel_id = mobj.group(1)
1764 self.report_download_page(channel_id, pagenum)
1765 url = self._TEMPLATE_URL % (channel_id, pagenum)
1766 request = compat_urllib_request.Request(url)
1768 page = compat_urllib_request.urlopen(request).read().decode('utf8')
1769 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1770 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1773 # Extract video identifiers
1775 for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
1776 if mobj.group(1) not in ids_in_page:
1777 ids_in_page.append(mobj.group(1))
1778 video_ids.extend(ids_in_page)
# Stop paging when the "Next" marker disappears from the page.
1780 if self._MORE_PAGES_INDICATOR not in page:
1782 pagenum = pagenum + 1
1784 self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))
1786 for id in video_ids:
1787 self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
1791 class YoutubeUserIE(InfoExtractor):
1792 """Information Extractor for YouTube users."""
1794 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
1795 _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
# The GData API returns at most this many entries per request.
1796 _GDATA_PAGE_SIZE = 50
1797 _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
1798 _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
1799 IE_NAME = u'youtube:user'
1801 def __init__(self, downloader=None):
1802 InfoExtractor.__init__(self, downloader)
1804 def report_download_page(self, username, start_index):
1805 """Report attempt to download user page."""
1806 self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
1807 (username, start_index, start_index + self._GDATA_PAGE_SIZE))
1809 def _real_extract(self, url):
1811 mobj = re.match(self._VALID_URL, url)
1813 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1816 username = mobj.group(1)
1818 # Download video ids using YouTube Data API. Result size per
1819 # query is limited (currently to 50 videos) so we need to query
1820 # page by page until there are no video ids - it means we got
# GData start-index is 1-based.
1827 start_index = pagenum * self._GDATA_PAGE_SIZE + 1
1828 self.report_download_page(username, start_index)
1830 request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
1833 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1834 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1835 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
1838 # Extract video identifiers
1841 for mobj in re.finditer(self._VIDEO_INDICATOR, page):
1842 if mobj.group(1) not in ids_in_page:
1843 ids_in_page.append(mobj.group(1))
1845 video_ids.extend(ids_in_page)
1847 # A little optimization - if current page is not
1848 # "full", ie. does not contain PAGE_SIZE video ids then
1849 # we can assume that this page is the last one - there
1850 # are no more ids on further pages - no need to query
1853 if len(ids_in_page) < self._GDATA_PAGE_SIZE:
1858 all_ids_count = len(video_ids)
# Apply --playlist-start/--playlist-end slicing (1-based start).
1859 playliststart = self._downloader.params.get('playliststart', 1) - 1
1860 playlistend = self._downloader.params.get('playlistend', -1)
1862 if playlistend == -1:
1863 video_ids = video_ids[playliststart:]
1865 video_ids = video_ids[playliststart:playlistend]
1867 self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
1868 (username, all_ids_count, len(video_ids)))
1870 for video_id in video_ids:
1871 self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
1874 class BlipTVUserIE(InfoExtractor):
1875 """Information Extractor for blip.tv users."""
1877 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
1879 IE_NAME = u'blip.tv:user'
1881 def __init__(self, downloader=None):
1882 InfoExtractor.__init__(self, downloader)
1884 def report_download_page(self, username, pagenum):
1885 """Report attempt to download user page."""
1886 self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
1887 (self.IE_NAME, username, pagenum))
1889 def _real_extract(self, url):
1891 mobj = re.match(self._VALID_URL, url)
1893 self._downloader.trouble(u'ERROR: invalid url: %s' % url)
1896 username = mobj.group(1)
# First fetch the user page to resolve the numeric users_id used by the
# mobile AJAX episode-list endpoint below.
1898 page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
1900 request = compat_urllib_request.Request(url)
1903 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1904 mobj = re.search(r'data-users-id="([^"]+)"', page)
1905 page_base = page_base % mobj.group(1)
1906 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
# NOTE(review): str(err) is inconsistent with the compat_str(err) used by the
# sibling extractors in this file -- confirm against full source before changing.
1907 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1911 # Download video ids using BlipTV Ajax calls. Result size per
1912 # query is limited (currently to 12 videos) so we need to query
1913 # page by page until there are no video ids - it means we got
1920 self.report_download_page(username, pagenum)
1922 request = compat_urllib_request.Request( page_base + "&page=" + str(pagenum) )
1925 page = compat_urllib_request.urlopen(request).read().decode('utf-8')
1926 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1927 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
1930 # Extract video identifiers
# NOTE(review): the membership test uses the raw match but the HTML-unescaped
# form is appended; entries differing only in escaping may not deduplicate.
1933 for mobj in re.finditer(r'href="/([^"]+)"', page):
1934 if mobj.group(1) not in ids_in_page:
1935 ids_in_page.append(unescapeHTML(mobj.group(1)))
1937 video_ids.extend(ids_in_page)
1939 # A little optimization - if current page is not
1940 # "full", ie. does not contain PAGE_SIZE video ids then
1941 # we can assume that this page is the last one - there
1942 # are no more ids on further pages - no need to query
# NOTE(review): _PAGE_SIZE is not defined in this view; presumably a class
# attribute (12, per the comment above) -- confirm against full source.
1945 if len(ids_in_page) < self._PAGE_SIZE:
1950 all_ids_count = len(video_ids)
# Apply --playlist-start/--playlist-end slicing (1-based start).
1951 playliststart = self._downloader.params.get('playliststart', 1) - 1
1952 playlistend = self._downloader.params.get('playlistend', -1)
1954 if playlistend == -1:
1955 video_ids = video_ids[playliststart:]
1957 video_ids = video_ids[playliststart:playlistend]
1959 self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
1960 (self.IE_NAME, username, all_ids_count, len(video_ids)))
1962 for video_id in video_ids:
1963 self._downloader.download([u'http://blip.tv/'+video_id])
# Extractor for depositfiles.com file pages: simulates pressing the
# "Free download" button and scrapes the resulting direct file URL and title.
# NOTE(review): chunk is elided — `try:`, `return` lines and the result-dict
# braces are missing here; confirm against the full file.
1966 class DepositFilesIE(InfoExtractor):
1967 """Information extractor for depositfiles.com"""
1969 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
# Progress-reporting helpers (delegate to the downloader's to_screen()).
1971 def report_download_webpage(self, file_id):
1972 """Report webpage download."""
1973 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
1975 def report_extraction(self, file_id):
1976 """Report information extraction."""
1977 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
1979 def _real_extract(self, url):
1980 file_id = url.split('/')[-1]
1981 # Rebuild url in english locale
1982 url = 'http://depositfiles.com/en/files/' + file_id
1984 # Retrieve file webpage with 'Free download' button pressed
# POSTing gateway_result=1 emulates the free-download form submission.
1985 free_download_indication = { 'gateway_result' : '1' }
1986 request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
1988 self.report_download_webpage(file_id)
1989 webpage = compat_urllib_request.urlopen(request).read()
1990 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1991 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % compat_str(err))
1994 # Search for the real file URL
1995 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
1996 if (mobj is None) or (mobj.group(1) is None):
1997 # Try to figure out reason of the error.
# Sites show an <strong>Attention...</strong> banner when download is blocked;
# surface that message to the user instead of a generic failure.
1998 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
1999 if (mobj is not None) and (mobj.group(1) is not None):
2000 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
2001 self._downloader.trouble(u'ERROR: %s' % restriction_message)
2003 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
2006 file_url = mobj.group(1)
2007 file_extension = os.path.splitext(file_url)[1][1:]
2009 # Search for file title
2010 mobj = re.search(r'<b title="(.*?)">', webpage)
2012 self._downloader.trouble(u'ERROR: unable to extract title')
# NOTE(review): .decode('utf-8') on these values assumes Python-2 byte
# strings; on Python 3 str has no .decode — verify against the file's
# target interpreter.
2014 file_title = mobj.group(1).decode('utf-8')
2017 'id': file_id.decode('utf-8'),
2018 'url': file_url.decode('utf-8'),
2020 'upload_date': None,
2021 'title': file_title,
2022 'ext': file_extension.decode('utf-8'),
# Extractor for Facebook videos. Logs in (credentials or .netrc) during
# initialization, then parses the SWF parameter blob embedded in the video
# page to find the hd/sd stream URLs.
# NOTE(review): chunk is elided — guard lines (`if mobj is None:`, `return`,
# `try:`, `else:`) are missing here; confirm against the full file.
2026 class FacebookIE(InfoExtractor):
2027 """Information Extractor for Facebook"""
2029 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2030 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2031 _NETRC_MACHINE = 'facebook'
2032 IE_NAME = u'facebook'
2034 def report_login(self):
2035 """Report attempt to log in."""
2036 self._downloader.to_screen(u'[%s] Logging in' % self.IE_NAME)
# Optional login step: uses --username/--password if given, falls back to
# the "facebook" machine entry in ~/.netrc when --netrc is enabled.
2038 def _real_initialize(self):
2039 if self._downloader is None:
2044 downloader_params = self._downloader.params
2046 # Attempt to use provided username and password or .netrc data
2047 if downloader_params.get('username', None) is not None:
2048 useremail = downloader_params['username']
2049 password = downloader_params['password']
2050 elif downloader_params.get('usenetrc', False):
2052 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2053 if info is not None:
2057 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2058 except (IOError, netrc.NetrcParseError) as err:
2059 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
2062 if useremail is None:
2071 request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
2074 login_results = compat_urllib_request.urlopen(request).read()
# A login <form> still present in the response means authentication failed;
# warn but continue (public videos may still work).
2075 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2076 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2078 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2079 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
2082 def _real_extract(self, url):
2083 mobj = re.match(self._VALID_URL, url)
2085 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2087 video_id = mobj.group('ID')
2089 url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
2090 webpage = self._download_webpage(url, video_id)
# The player config is a JSON array sandwiched between these two exact
# JavaScript snippets; locate it by escaping them as literal delimiters.
2092 BEFORE = '[["allowFullScreen","true"],["allowScriptAccess","always"],["salign","tl"],["scale","noscale"],["wmode","opaque"]].forEach(function(param) {swf.addParam(param[0], param[1]);});\n'
2093 AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
2094 m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
2096 raise ExtractorError(u'Cannot parse data')
2097 data = dict(json.loads(m.group(1)))
2098 params_raw = compat_urllib_parse.unquote(data['params'])
2099 params = json.loads(params_raw)
# Prefer the HD stream, fall back to SD.
2100 video_url = params['hd_src']
2102 video_url = params['sd_src']
2104 raise ExtractorError(u'Cannot find video URL')
2105 video_duration = int(params['video_duration'])
2107 m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
2109 raise ExtractorError(u'Cannot find title in webpage')
2110 video_title = unescapeHTML(m.group(1))
2114 'title': video_title,
2117 'duration': video_duration,
2118 'thumbnail': params['thumbnail_src'],
# Extractor for individual blip.tv videos. Requests the page with
# skin=json to get machine-readable metadata, but also handles the case
# where the URL already points directly at a media file.
# NOTE(review): chunk is elided — `if ... is None:`, `try:`, `return` lines
# and parts of the result dicts are missing here; confirm in the full file.
2123 class BlipTVIE(InfoExtractor):
2124 """Information extractor for blip.tv"""
2126 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
# _URL_EXT pulls the filename extension off the end of a media URL.
2127 _URL_EXT = r'^.*\.([a-z0-9]+)$'
2128 IE_NAME = u'blip.tv'
2130 def report_extraction(self, file_id):
2131 """Report information extraction."""
2132 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2134 def report_direct_download(self, title):
2135 """Report information extraction."""
2136 self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
2138 def _real_extract(self, url):
2139 mobj = re.match(self._VALID_URL, url)
2141 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Append the JSON-skin query (cchar is '?' or '&' depending on the URL).
2148 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2149 request = compat_urllib_request.Request(json_url)
# blip.tv serves different data depending on User-Agent; impersonate iTunes.
2150 request.add_header('User-Agent', 'iTunes/10.6.1')
2151 self.report_extraction(mobj.group(1))
2154 urlh = compat_urllib_request.urlopen(request)
# If the server answers with the media itself, synthesize minimal info
# from the URL instead of parsing JSON.
2155 if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
2156 basename = url.split('/')[-1]
2157 title,ext = os.path.splitext(basename)
2158 title = title.decode('UTF-8')
2159 ext = ext.replace('.', '')
2160 self.report_direct_download(title)
2165 'upload_date': None,
2170 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2171 raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
2172 if info is None: # Regular URL
2174 json_code_bytes = urlh.read()
2175 json_code = json_code_bytes.decode('utf-8')
2176 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2177 self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % compat_str(err))
2181 json_data = json.loads(json_code)
# The payload is either wrapped in a 'Post' object or is the post itself.
2182 if 'Post' in json_data:
2183 data = json_data['Post']
# Convert blip.tv's "MM-DD-YY HH:MM(am|pm)" stamp to YYYYMMDD.
2187 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2188 video_url = data['media']['url']
2189 umobj = re.match(self._URL_EXT, video_url)
2191 raise ValueError('Can not determine filename extension')
2192 ext = umobj.group(1)
2195 'id': data['item_id'],
2197 'uploader': data['display_name'],
2198 'upload_date': upload_date,
2199 'title': data['title'],
2201 'format': data['media']['mimeType'],
2202 'thumbnail': data['thumbnailUrl'],
2203 'description': data['description'],
2204 'player_url': data['embedUrl'],
# The same iTunes User-Agent must be used for the media download itself.
2205 'user_agent': 'iTunes/10.6.1',
2207 except (ValueError,KeyError) as err:
2208 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de.

    Extracts the direct .flv URL and the page title from a myvideo.de
    watch page.  The media URL is derived from the thumbnail's base URL
    plus the numeric video id.
    """

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # BUG FIX: this path previously called self._download.trouble(),
            # an attribute that does not exist (it raised AttributeError
            # instead of reporting the error); it must be self._downloader.
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)
        # The thumbnail <link> carries the movie's base URL; the stream is
        # that base plus "/<id>.flv".  (Dot in "is[0-9]\.myvideo" escaped —
        # it was an unescaped any-character dot before.)
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9]\.myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\' />',
                 webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id':          video_id,
            'url':         video_url,
            'uploader':    None,
            'upload_date': None,
            'title':       video_title,
            'ext':         u'flv',
        }]
# Extractor for The Daily Show / The Colbert Report. Resolves shortcut
# URLs, follows redirects to a concrete episode, downloads the MRSS show
# index, then one config XML per episode part, and finally rewrites the
# RTMP stream URL into a plain HTTP one.
# NOTE(review): chunk is heavily elided — guard/`try:`/`return` lines, the
# _video_extensions/_video_dimensions dict bodies and parts of the format
# selection are missing here; confirm against the full file.
2263 class ComedyCentralIE(InfoExtractor):
2264 """Information extractor for The Daily Show and Colbert Report """
2266 # urls can be abbreviations like :thedailyshow or :colbert
2267 # urls for episodes like:
2268 # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
2269 # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
2270 # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
2271 _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
2272 |(https?://)?(www\.)?
2273 (?P<showname>thedailyshow|colbertnation)\.com/
2274 (full-episodes/(?P<episode>.*)|
2276 (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
2277 |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
# Bitrates offered by the CDN, lowest-priority first.
2280 _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
2282 _video_extensions = {
2290 _video_dimensions = {
# Overrides the base suitable(): _VALID_URL is a verbose-mode regex here.
2299 def suitable(self, url):
2300 """Receives a URL and returns True if suitable for this IE."""
2301 return re.match(self._VALID_URL, url, re.VERBOSE) is not None
2303 def report_extraction(self, episode_id):
2304 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
2306 def report_config_download(self, episode_id, media_id):
2307 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration for %s' % (episode_id, media_id))
2309 def report_index_download(self, episode_id):
2310 self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
# Pretty-print available formats for --list-formats.
2312 def _print_formats(self, formats):
2313 print('Available formats:')
2315 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))
2318 def _real_extract(self, url):
2319 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2321 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Expand ":tds"/":colbert"-style shortcuts to the full-episodes front page.
2324 if mobj.group('shortname'):
2325 if mobj.group('shortname') in ('tds', 'thedailyshow'):
2326 url = u'http://www.thedailyshow.com/full-episodes/'
2328 url = u'http://www.colbertnation.com/full-episodes/'
2329 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2330 assert mobj is not None
2332 if mobj.group('clip'):
2333 if mobj.group('showname') == 'thedailyshow':
2334 epTitle = mobj.group('tdstitle')
2336 epTitle = mobj.group('cntitle')
2339 dlNewest = not mobj.group('episode')
2341 epTitle = mobj.group('showname')
2343 epTitle = mobj.group('episode')
2345 req = compat_urllib_request.Request(url)
2346 self.report_extraction(epTitle)
2348 htmlHandle = compat_urllib_request.urlopen(req)
2349 html = htmlHandle.read()
2350 webpage = html.decode('utf-8')
2351 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2352 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
# The front page redirects to the newest episode; re-match the final URL.
2355 url = htmlHandle.geturl()
2356 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2358 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
2360 if mobj.group('episode') == '':
2361 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
2363 epTitle = mobj.group('episode')
# Find the mtvnservices media URIs embedded in the page markup.
2365 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)
2367 if len(mMovieParams) == 0:
2368 # The Colbert Report embeds the information in a without
2369 # a URL prefix; so extract the alternate reference
2370 # and then add the URL prefix manually.
2372 altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
2373 if len(altMovieParams) == 0:
2374 self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
2377 mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
2379 uri = mMovieParams[0][1]
2380 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
2381 self.report_index_download(epTitle)
2383 indexXml = compat_urllib_request.urlopen(indexUrl).read()
2384 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2385 self._downloader.trouble(u'ERROR: unable to download episode index: ' + compat_str(err))
# One MRSS <item> per episode part; each carries a guid of the form
# "...:<show>.com:<mediaId>".
2390 idoc = xml.etree.ElementTree.fromstring(indexXml)
2391 itemEls = idoc.findall('.//item')
2392 for partNum,itemEl in enumerate(itemEls):
2393 mediaId = itemEl.findall('./guid')[0].text
2394 shortMediaId = mediaId.split(':')[-1]
2395 showId = mediaId.split(':')[-2].replace('.com', '')
2396 officialTitle = itemEl.findall('./title')[0].text
2397 officialDate = itemEl.findall('./pubDate')[0].text
2399 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2400 compat_urllib_parse.urlencode({'uri': mediaId}))
2401 configReq = compat_urllib_request.Request(configUrl)
2402 self.report_config_download(epTitle, shortMediaId)
2404 configXml = compat_urllib_request.urlopen(configReq).read()
2405 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2406 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
# Collect (bitrate, rtmp-url) pairs from the config's <rendition> nodes.
2409 cdoc = xml.etree.ElementTree.fromstring(configXml)
2411 for rendition in cdoc.findall('.//rendition'):
2412 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2416 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
2419 if self._downloader.params.get('listformats', None):
2420 self._print_formats([i[0] for i in turls])
2423 # For now, just pick the highest bitrate
2424 format,rtmp_video_url = turls[-1]
2426 # Get the format arg from the arg stream
2427 req_format = self._downloader.params.get('format', None)
2429 # Select format if we can find one
2432 format, rtmp_video_url = f, v
# Translate the rtmp URL into the equivalent plain-HTTP mirror.
2435 m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
2437 raise ExtractorError(u'Cannot transform RTMP url')
2438 base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
2439 video_url = base + m.group('finalid')
2441 effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
2446 'upload_date': officialDate,
2451 'description': officialTitle,
2453 results.append(info)
# Extractor for escapistmagazine.com videos: reads Open Graph meta tags
# for description/thumbnail/player, then fetches the player's JSON-ish
# config to obtain the actual media URL.
# NOTE(review): chunk is elided — `try:`/`return` lines and the result-dict
# opening are missing here; confirm against the full file.
2458 class EscapistIE(InfoExtractor):
2459 """Information extractor for The Escapist """
2461 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2462 IE_NAME = u'escapist'
2464 def report_extraction(self, showName):
2465 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
2467 def report_config_download(self, showName):
2468 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
2470 def _real_extract(self, url):
2471 mobj = re.match(self._VALID_URL, url)
2473 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2475 showName = mobj.group('showname')
2476 videoId = mobj.group('episode')
2478 self.report_extraction(showName)
2480 webPage = compat_urllib_request.urlopen(url)
2481 webPageBytes = webPage.read()
# Decode using the charset advertised in Content-Type, defaulting to UTF-8.
2482 m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
2483 webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
2484 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2485 self._downloader.trouble(u'ERROR: unable to download webpage: ' + compat_str(err))
# Pull metadata from the page's <meta> tags.
2488 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2489 description = unescapeHTML(descMatch.group(1))
2490 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2491 imgUrl = unescapeHTML(imgMatch.group(1))
2492 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2493 playerUrl = unescapeHTML(playerUrlMatch.group(1))
# The player URL embeds a percent-encoded config URL in its query string.
2494 configUrlMatch = re.search('config=(.*)$', playerUrl)
2495 configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))
2497 self.report_config_download(showName)
2499 configJSON = compat_urllib_request.urlopen(configUrl)
2500 m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
2501 configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
2502 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2503 self._downloader.trouble(u'ERROR: unable to download configuration: ' + compat_str(err))
2506 # Technically, it's JavaScript, not JSON
# Crude JS→JSON fixup: swap single quotes for double quotes before parsing.
2507 configJSON = configJSON.replace("'", '"')
2510 config = json.loads(configJSON)
2511 except (ValueError,) as err:
2512 self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + compat_str(err))
# The second playlist entry holds the actual video URL.
2515 playlist = config['playlist']
2516 videoUrl = playlist[1]['url']
2521 'uploader': showName,
2522 'upload_date': None,
2525 'thumbnail': imgUrl,
2526 'description': description,
2527 'player_url': playerUrl,
# Extractor for collegehumor.com: downloads the moogaloop metadata XML,
# then the Adobe f4m manifest it points at, and reconstructs the HDS
# fragment URL for the stream.
# NOTE(review): chunk is elided — `try:`/`return` lines, the info-dict
# opening and the closing of _real_extract are missing here; confirm in
# the full file.
2532 class CollegeHumorIE(InfoExtractor):
2533 """Information extractor for collegehumor.com"""
2536 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2537 IE_NAME = u'collegehumor'
2539 def report_manifest(self, video_id):
2540 """Report information extraction."""
2541 self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))
2543 def report_extraction(self, video_id):
2544 """Report information extraction."""
2545 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2547 def _real_extract(self, url):
2548 mobj = re.match(self._VALID_URL, url)
2550 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2552 video_id = mobj.group('videoid')
2557 'upload_date': None,
2560 self.report_extraction(video_id)
2561 xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
2563 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2564 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2565 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
# Metadata XML: <video> node carries description/caption/thumbnail and
# the manifest URL in <file>.
2568 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2570 videoNode = mdoc.findall('./video')[0]
2571 info['description'] = videoNode.findall('./description')[0].text
2572 info['title'] = videoNode.findall('./caption')[0].text
2573 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
2574 manifest_url = videoNode.findall('./file')[0].text
2576 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
# hdcore is required by the HDS server to serve the manifest.
2579 manifest_url += '?hdcore=2.10.3'
2580 self.report_manifest(video_id)
2582 manifestXml = compat_urllib_request.urlopen(manifest_url).read()
2583 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2584 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
# f4m manifest (Adobe namespace): media/@url plus <id> give the pieces of
# the final fragment URL.
2587 adoc = xml.etree.ElementTree.fromstring(manifestXml)
2589 media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
2590 node_id = media_node.attrib['url']
2591 video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
2592 except IndexError as err:
2593 self._downloader.trouble(u'\nERROR: Invalid manifest file')
2596 url_pr = compat_urllib_parse_urlparse(manifest_url)
2597 url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
# Extractor for xvideos.com: scrapes the flv URL, title and thumbnail
# straight out of the watch-page HTML.
# NOTE(review): chunk is elided — `if mobj is None:`/`return` lines and the
# result-dict opening are missing here; confirm against the full file.
2604 class XVideosIE(InfoExtractor):
2605 """Information extractor for xvideos.com"""
2607 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2608 IE_NAME = u'xvideos'
2610 def report_extraction(self, video_id):
2611 """Report information extraction."""
2612 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2614 def _real_extract(self, url):
2615 mobj = re.match(self._VALID_URL, url)
2617 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2619 video_id = mobj.group(1)
2621 webpage = self._download_webpage(url, video_id)
2623 self.report_extraction(video_id)
# The player config embeds the percent-encoded media URL as flv_url=...
2627 mobj = re.search(r'flv_url=(.+?)&', webpage)
2629 self._downloader.trouble(u'ERROR: unable to extract video url')
2631 video_url = compat_urllib_parse.unquote(mobj.group(1))
2635 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2637 self._downloader.trouble(u'ERROR: unable to extract video title')
2639 video_title = mobj.group(1)
2642 # Extract video thumbnail
# group(0): the whole matched URL is the thumbnail address.
2643 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
2645 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2647 video_thumbnail = mobj.group(0)
2653 'upload_date': None,
2654 'title': video_title,
2656 'thumbnail': video_thumbnail,
2657 'description': None,
# Extractor for soundcloud.com tracks: resolves the page URL to a track id
# via the public resolve API, then asks the streams endpoint for the
# 128kbps MP3 URL.
# NOTE(review): chunk is elided — `if mobj is None:`/`try:`/`return` lines
# and the result-dict opening are missing here; confirm in the full file.
2663 class SoundcloudIE(InfoExtractor):
2664 """Information extractor for soundcloud.com
2665 To access the media, the uid of the song and a stream token
2666 must be extracted from the page source and the script must make
2667 a request to media.soundcloud.com/crossdomain.xml. Then
2668 the media can be grabbed by requesting from an url composed
2669 of the stream token and uid
2672 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2673 IE_NAME = u'soundcloud'
2675 def __init__(self, downloader=None):
2676 InfoExtractor.__init__(self, downloader)
2678 def report_resolve(self, video_id):
2679 """Report information extraction."""
2680 self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))
2682 def report_extraction(self, video_id):
2683 """Report information extraction."""
2684 self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))
2686 def _real_extract(self, url):
2687 mobj = re.match(self._VALID_URL, url)
2689 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2692 # extract uploader (which is in the url)
2693 uploader = mobj.group(1)
2694 # extract simple title (uploader + slug of song title)
2695 slug_title = mobj.group(2)
2696 simple_title = uploader + u'-' + slug_title
2698 self.report_resolve('%s/%s' % (uploader, slug_title))
# Resolve the human-readable page URL into the track's API representation.
2700 url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
2701 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2702 request = compat_urllib_request.Request(resolv_url)
2704 info_json_bytes = compat_urllib_request.urlopen(request).read()
2705 info_json = info_json_bytes.decode('utf-8')
2706 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2707 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2710 info = json.loads(info_json)
2711 video_id = info['id']
2712 self.report_extraction('%s/%s' % (uploader, slug_title))
# Second API call: list the available streams for this track id.
2714 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2715 request = compat_urllib_request.Request(streams_url)
2717 stream_json_bytes = compat_urllib_request.urlopen(request).read()
2718 stream_json = stream_json_bytes.decode('utf-8')
2719 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2720 self._downloader.trouble(u'ERROR: unable to download stream definitions: %s' % compat_str(err))
2723 streams = json.loads(stream_json)
2724 mediaURL = streams['http_mp3_128_url']
2729 'uploader': info['user']['username'],
2730 'upload_date': info['created_at'],
2731 'title': info['title'],
2733 'description': info['description'],
# Extractor for infoq.com presentations: the real media id is base64-encoded
# in the page's jsclassref attribute and is turned into an rtmpe URL.
# NOTE(review): chunk is elided — `if mobj is None:`/`return` lines and the
# result-dict opening are missing here; confirm against the full file.
2737 class InfoQIE(InfoExtractor):
2738 """Information extractor for infoq.com"""
2739 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2741 def report_extraction(self, video_id):
2742 """Report information extraction."""
2743 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2745 def _real_extract(self, url):
2746 mobj = re.match(self._VALID_URL, url)
2748 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2751 webpage = self._download_webpage(url, video_id=url)
2752 self.report_extraction(url)
# jsclassref holds the base64/percent-encoded path of the media asset.
2755 mobj = re.search(r"jsclassref='([^']*)'", webpage)
2757 self._downloader.trouble(u'ERROR: unable to extract video url')
2759 real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
2760 video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id
2763 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2765 self._downloader.trouble(u'ERROR: unable to extract video title')
2767 video_title = mobj.group(1)
2769 # Extract description
2770 video_description = u'No description available.'
2771 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2772 if mobj is not None:
2773 video_description = mobj.group(1)
# Derive the id and extension from the media filename itself.
2775 video_filename = video_url.split('/')[-1]
2776 video_id, extension = video_filename.split('.')
2782 'upload_date': None,
2783 'title': video_title,
2784 'ext': extension, # Extension is always(?) mp4, but seems to be flv
2786 'description': video_description,
# Extractor for mixcloud.com (marked broken via _WORKING = False): queries
# the old cloudcast JSON API for per-format/per-bitrate URL lists and picks
# the first URL that actually responds.
# NOTE(review): chunk is elided — `try:`/`return` lines and parts of the
# loops are missing here; confirm against the full file.
2791 class MixcloudIE(InfoExtractor):
2792 """Information extractor for www.mixcloud.com"""
2794 _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
2795 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2796 IE_NAME = u'mixcloud'
2798 def __init__(self, downloader=None):
2799 InfoExtractor.__init__(self, downloader)
2801 def report_download_json(self, file_id):
2802 """Report JSON download."""
2803 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
2805 def report_extraction(self, file_id):
2806 """Report information extraction."""
2807 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
# Returns the URL list for the requested format; some entries are keyed by
# bitrate (dict), others are a plain list — hence the TypeError fallback.
2809 def get_urls(self, jsonData, fmt, bitrate='best'):
2810 """Get urls from 'audio_formats' section in json"""
2813 bitrate_list = jsonData[fmt]
2814 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2815 bitrate = max(bitrate_list) # select highest
2817 url_list = jsonData[fmt][bitrate]
2818 except TypeError: # we have no bitrate info.
2819 url_list = jsonData[fmt]
# Probes candidate URLs in order and returns the first live one.
2822 def check_urls(self, url_list):
2823 """Returns 1st active url from list"""
2824 for url in url_list:
2826 compat_urllib_request.urlopen(url)
2828 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2833 def _print_formats(self, formats):
2834 print('Available formats:')
2835 for fmt in formats.keys():
2836 for b in formats[fmt]:
2838 ext = formats[fmt][b][0]
2839 print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
2840 except TypeError: # we have no bitrate info
2841 ext = formats[fmt][0]
2842 print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
2845 def _real_extract(self, url):
2846 mobj = re.match(self._VALID_URL, url)
2848 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2850 # extract uploader & filename from url
# NOTE(review): .decode('utf-8') on re match groups assumes Python-2 byte
# strings; on Python 3 these are already str — consistent with _WORKING
# being False.
2851 uploader = mobj.group(1).decode('utf-8')
2852 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2854 # construct API request
2855 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2856 # retrieve .json file with links to files
2857 request = compat_urllib_request.Request(file_url)
2859 self.report_download_json(file_url)
2860 jsonData = compat_urllib_request.urlopen(request).read()
2861 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2862 self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err))
2866 json_data = json.loads(jsonData)
2867 player_url = json_data['player_swf_url']
2868 formats = dict(json_data['audio_formats'])
2870 req_format = self._downloader.params.get('format', None)
2873 if self._downloader.params.get('listformats', None):
2874 self._print_formats(formats)
# Default/best: walk all formats and take the first with a live URL;
# otherwise honour the explicitly requested format.
2877 if req_format is None or req_format == 'best':
2878 for format_param in formats.keys():
2879 url_list = self.get_urls(formats, format_param)
2881 file_url = self.check_urls(url_list)
2882 if file_url is not None:
2885 if req_format not in formats:
2886 self._downloader.trouble(u'ERROR: format is not available')
2889 url_list = self.get_urls(formats, req_format)
2890 file_url = self.check_urls(url_list)
2891 format_param = req_format
2894 'id': file_id.decode('utf-8'),
2895 'url': file_url.decode('utf-8'),
2896 'uploader': uploader.decode('utf-8'),
2897 'upload_date': None,
2898 'title': json_data['name'],
2899 'ext': file_url.split('.')[-1].decode('utf-8'),
2900 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2901 'thumbnail': json_data['thumbnail_url'],
2902 'description': json_data['description'],
2903 'player_url': player_url.decode('utf-8'),
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""

    # NOTE(review): several source lines are missing from this dump (guards
    # such as 'if mobj is None:', 'try:' openers, dict-literal scaffolding
    # and 'return' statements). Comments below flag the gaps rather than
    # guessing at the lost code.
    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def report_download_webpage(self, objid):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        # Dispatch on which named groups matched: a single video, a course
        # page (list of videos) or the site root (list of courses).
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): missing 'if mobj is None:' guard before this raise.
        raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            # NOTE(review): missing 'info = {' opener for the entries below.
            'id': course + '_' + video,
            'upload_date': None,

            self.report_extraction(info['id'])
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            # NOTE(review): missing 'try:' opener for the except clause below.
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            # NOTE(review): missing 'try:'/'except' pair around the lookups
            # below; the trouble() call belongs to the lost except branch.
            info['title'] = mdoc.findall('./title')[0].text
            info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
            info['ext'] = info['url'].rpartition('.')[2]
            # NOTE(review): missing 'return [info]'.
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            # NOTE(review): missing 'info = {' opener ('id'/'type' entries lost).
            'upload_date': None,

            coursepage = self._download_webpage(url, info['id'],
                                    note='Downloading course info page',
                                    errnote='Unable to download course info page')

            m = re.search('<h1>([^<]+)</h1>', coursepage)
            # NOTE(review): missing 'if m:'/'else:' around the two assignments.
            info['title'] = unescapeHTML(m.group(1))
            info['title'] = info['id']

            m = re.search('<description>([^<]+)</description>', coursepage)
            # NOTE(review): missing 'if m:' guard.
            info['description'] = unescapeHTML(m.group(1))

            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            # NOTE(review): missing scaffolding that built info['list'] out of
            # reference dicts with the two entries below (vpage loop variable).
            'type': 'reference',
            'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),

            # Recursively extract each referenced video page.
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            # NOTE(review): missing 'results = []' initializer and final return.
            # NOTE(review): missing 'else:' opener for the root-page branch and
            # its 'info = {' scaffolding around the entries below.
            'id': 'Stanford OpenClassroom',
            'upload_date': None,

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            # NOTE(review): missing 'try:' opener for the except clause below.
            rootpage = compat_urllib_request.urlopen(rootURL).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))

            info['title'] = info['id']

            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            # NOTE(review): missing info['list'] scaffolding (cpage loop variable).
            'type': 'reference',
            'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),

            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    # NOTE(review): lines are missing from this dump ('if mobj is None:'
    # guards, 'return's, 'try:' openers and parts of the final info dict);
    # comments below flag the gaps.
    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): missing 'if mobj is None:' guard and 'return'.
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        # Normalize scheme-less URLs so the download helper gets a full URL.
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        # Song name, performer and playlist URI come from <meta> tags.
        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
        # NOTE(review): missing 'if mobj is None:' guard and 'return'.
        self._downloader.trouble(u'ERROR: unable to extract song name')
        # .decode('iso-8859-1') implies webpage is bytes here — TODO confirm.
        song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
        # NOTE(review): missing 'if mobj is None:' guard and 'return'.
        self._downloader.trouble(u'ERROR: unable to extract performer')
        performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
        video_title = performer + ' - ' + song_name

        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
        # NOTE(review): missing 'if mobj is None:' guard and 'return'.
        self._downloader.trouble(u'ERROR: unable to mtvn_uri')
        mtvn_uri = mobj.group(1)

        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
        # NOTE(review): missing 'if mobj is None:' guard and 'return'.
        self._downloader.trouble(u'ERROR: unable to extract content id')
        content_id = mobj.group(1)

        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        # NOTE(review): missing 'try:' opener for the except clause below.
        metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        # NOTE(review): missing 'try:'/'except' pair around the attribute
        # lookups; the trouble() call belongs to the lost except branch.
        _,_,ext = rendition.attrib['type'].partition('/')
        format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
        video_url = rendition.find('./src').text
        self._downloader.trouble('Invalid rendition field.')

        # NOTE(review): missing 'info = {' opener with 'id'/'url'/'ext'/
        # 'format' entries and the final 'return [info]'.
        'uploader': performer,
        'upload_date': None,
        'title': video_title,
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com."""
    # NOTE(review): lines are missing from this dump (the '_gen_sid' def
    # line, guards, 'try:' openers, returns and dict scaffolding);
    # comments below flag the gaps.
    _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def report_download_webpage(self, file_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, file_id))

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    # NOTE(review): missing 'def _gen_sid(self):' header for the body below.
        # Session id: millisecond timestamp plus two random components.
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        # Deterministic shuffle of the alphabet driven by 'seed' with a
        # linear-congruential step — presumably mirrors Youku's player
        # obfuscation; TODO confirm against the site.
        # NOTE(review): missing 'mixed = []' initializer.
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        for i in range(len(source)):
            seed = (seed * 211 + 30031 ) % 65536
            index = math.floor(seed / 65536 * len(source) )
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        #return ''.join(mixed)
        # NOTE(review): missing 'return mixed'.

    def _get_file_id(self, fileId, seed):
        # Decode the obfuscated file id: each '*'-separated token indexes
        # into the seeded mix string.
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        # NOTE(review): missing 'realId = []' and the loop opener that
        # binds 'ch' for the append below.
        realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): missing 'if mobj is None:' guard and 'return'.
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        request = compat_urllib_request.Request(info_url, None, std_headers)
        # NOTE(review): missing 'try:' opener for the except clause below.
        self.report_download_webpage(video_id)
        jsondata = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

        self.report_extraction(video_id)
        # NOTE(review): missing 'try:' opener; the except near the bottom of
        # this section closes it.
        jsonstr = jsondata.decode('utf-8')
        config = json.loads(jsonstr)

        video_title = config['data'][0]['title']
        seed = config['data'][0]['seed']

        format = self._downloader.params.get('format', None)
        supported_format = list(config['data'][0]['streamfileids'].keys())

        # Map the requested quality to one of Youku's stream ids.
        if format is None or format == 'best':
            if 'hd2' in supported_format:
                # NOTE(review): missing format-selection assignments
                # ('hd2'/fallback branches) in this area.
        elif format == 'worst':
            # NOTE(review): missing assignments and 'else:' branch.

        fileid = config['data'][0]['streamfileids'][format]
        keys = [s['k'] for s in config['data'][0]['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            self._downloader.trouble(u'ERROR: unable to extract info section')

        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        #column 8,9 of fileid represent the segment number
        #fileid[7:9] should be changed
        for index, key in enumerate(keys):
            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
            # NOTE(review): missing 'info = {' opener with the remaining keys.
            'id': '%s_part%02d' % (video_id, index),
            'url': download_url,
            'upload_date': None,
            'title': video_title,
            files_info.append(info)
        # NOTE(review): missing 'files_info = []' initializer and
        # 'return files_info'.
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    # Flashvars-style parameters embedded in the page.
    VIDEO_URL_RE = r'flv_url=(.*?)&'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'

    def report_webpage(self, video_id):
        """Report information extraction"""
        self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))

    def report_extraction(self, video_id):
        """Report information extraction"""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

    def _real_extract(self, url):
        """Extract the flv URL, title and thumbnail from a video page.

        NOTE(review): the dump of this class was missing its guard,
        'try:' and result-dict lines; they are reconstructed here from
        the surviving 'except'/group() lines.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = mobj.group(1)

        self.report_webpage(video_id)

        # Get webpage content
        try:
            webpage_bytes = compat_urllib_request.urlopen(url).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
            return

        # The flv URL is percent-encoded inside a flashvars parameter.
        result = re.search(self.VIDEO_URL_RE, webpage)
        if result is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(result.group(1))

        result = re.search(self.VIDEO_TITLE_RE, webpage)
        if result is None:
            self._downloader.trouble(u'ERROR: unable to extract video title')
            return
        video_title = result.group(1)

        result = re.search(self.VIDEO_THUMB_RE, webpage)
        if result is None:
            self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
            return
        video_thumbnail = result.group(1)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',  # flv_url parameter implies a flv stream
            'thumbnail': video_thumbnail,
            'description': None,
        }]
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    # NOTE(review): lines are missing from this dump (guards, 'try:'
    # openers, default assignments and the final info-dict scaffolding);
    # comments below flag the gaps.
    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_extract_entry(self, url):
        """Report downloading entry"""
        self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)

    def report_date(self, upload_date):
        """Report the entry's upload date"""
        self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)

    def report_uploader(self, uploader):
        """Report the entry's uploader"""
        self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)

    def report_title(self, video_title):
        """Report the entry's title"""
        self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)

    def report_extract_vid_page(self, video_page):
        """Report information extraction."""
        self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)

    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): missing 'if mobj is None:' guard and 'return'.
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        post_url = mobj.group(0)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        self.report_extract_entry(post_url)
        request = compat_urllib_request.Request(post_url)
        # NOTE(review): missing 'try:' opener for the except clause below.
        webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))

        # Extract update date
        # NOTE(review): missing default assignment and 'if mobj:' guard
        # around the group(1) access below.
        pattern = 'title="Timestamp">(.*?)</a>'
        mobj = re.search(pattern, webpage)
        upload_date = mobj.group(1)
        # Convert timestring to a format suitable for filename
        upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
        upload_date = upload_date.strftime('%Y%m%d')
        self.report_date(upload_date)

        # Extract uploader from the rel="author" anchor.
        # NOTE(review): missing default assignment and 'if mobj:' guard.
        pattern = r'rel\="author".*?>(.*?)</a>'
        mobj = re.search(pattern, webpage)
        uploader = mobj.group(1)
        self.report_uploader(uploader)

        # Get the first line for title
        # NOTE(review): missing default assignment and 'if mobj:' guard.
        pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
        mobj = re.search(pattern, webpage)
        video_title = mobj.group(1)
        self.report_title(video_title)

        # Step 2, Stimulate clicking the image box to launch video
        pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
        mobj = re.search(pattern, webpage)
        # NOTE(review): missing 'if mobj is None:' guard and 'return'.
        self._downloader.trouble(u'ERROR: unable to extract video page URL')

        video_page = mobj.group(1)
        request = compat_urllib_request.Request(video_page)
        # NOTE(review): missing 'try:' opener for the except clause below.
        webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
        self.report_extract_vid_page(video_page)

        # Extract video links on video page
        """Extract video links of all sizes"""
        pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        mobj = re.findall(pattern, webpage)
        # NOTE(review): missing empty-result guard and 'return'.
        self._downloader.trouble(u'ERROR: unable to extract video links')

        # Sort in resolution
        links = sorted(mobj)

        # Choose the lowest of the sort, i.e. highest resolution
        video_url = links[-1]
        # Only get the url. The resolution part in the tuple has no use anymore
        video_url = video_url[-1]
        # Treat escaped \u0026 style hex
        # NOTE(review): missing 'try:' opener for the except clause below.
        video_url = video_url.decode("unicode_escape")
        except AttributeError: # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')

        # NOTE(review): missing 'return [{' opener with 'id'/'url' entries
        # and the closing '}]'.
        'uploader': uploader,
        'upload_date': upload_date,
        'title': video_title,
        'ext': video_extension,
class NBAIE(InfoExtractor):
    """Information extractor for nba.com video pages."""
    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
    IE_NAME = u'nba'  # NOTE(review): reconstructed; the line was lost in this dump

    def _real_extract(self, url):
        """Build the CDN mp4 URL from the page path and scrape metadata.

        NOTE(review): the dump of this method was missing its guard,
        else-branch and result-dict lines; they are reconstructed here
        from the surviving fragments.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)
        if video_id.endswith('/index.html'):
            video_id = video_id[:-len('/index.html')]

        webpage = self._download_webpage(url, video_id)

        # The 720p rendition lives at a predictable CDN path.
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'

        def _findProp(rexp, default=None):
            # Scrape one property from the page, HTML-unescaped.
            m = re.search(rexp, webpage)
            if m:
                return unescapeHTML(m.group(1))
            else:
                return default

        shortened_video_id = video_id.rpartition('/')[2]
        title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
        info = {
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            # NOTE(review): 'uploader_date' is not a field the downloader
            # documents (head lists 'upload_date', format YYYYMMDD); kept
            # as-is because the scraped value's format is unverified.
            'uploader_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
            'description': _findProp(r'<div class="description">(.*?)</h1>'),
        }
        return [info]
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    # NOTE(review): lines are missing from this dump ('try:' openers,
    # guards, list initializers and the pagination loop); comments below
    # flag the gaps.
    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_extraction(self, file_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
                (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url):
        # NOTE(review): missing 'try:' opener for the except clause below.
        urlh = compat_urllib_request.urlopen(url)
        webpage_bytes = urlh.read()
        webpage = webpage_bytes.decode('utf-8', 'ignore')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download video info JSON: %s' % compat_str(err))

        response = json.loads(webpage)
        # The API returns a dict (not a list) on error.
        if type(response) != list:
            error_text = response.get('error', 'unknown error')
            self._downloader.trouble(u'ERROR: Justin.tv API: %s' % error_text)
        # NOTE(review): missing 'info = []' initializer.
        for clip in response:
            video_url = clip['video_file_url']
            # NOTE(review): missing 'if video_url:' guard line.
            video_extension = os.path.splitext(video_url)[1][1:]
            # start_time begins with an ISO date; keep digits only (YYYYMMDD).
            video_date = re.sub('-', '', clip['start_time'][:10])
            video_uploader_id = clip.get('user_id', clip.get('channel_id'))
            video_id = clip['id']
            video_title = clip.get('title', video_id)
            # NOTE(review): missing 'info.append({' opener with 'id'/'url' keys.
            'title': video_title,
            'uploader': clip.get('channel_name', video_uploader_id),
            'uploader_id': video_uploader_id,
            'upload_date': video_date,
            'ext': video_extension,
        return (len(response), info)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): missing 'if mobj is None:' guard and 'return'.
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        api = 'http://api.justin.tv'
        video_id = mobj.group(mobj.lastindex)

        # lastindex == 1: only the channel group matched (whole channel,
        # paged); otherwise a single archived broadcast was requested.
        if mobj.lastindex == 1:
            # NOTE(review): missing 'paged' assignment here, and the 'else:'
            # opener between the two 'api +=' lines below.
            api += '/channel/archives/%s.json'
            api += '/broadcast/by_archive/%s.json'
        api = api % (video_id,)

        self.report_extraction(video_id)

        # NOTE(review): missing 'info = []', 'offset = 0' and the
        # pagination loop opener around the block below.
        limit = self._JUSTIN_PAGE_LIMIT
        self.report_download_page(video_id, offset)
        page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
        page_count, page_info = self._parse_page(page_url)
        info.extend(page_info)
        # A short page means we reached the end of the archive.
        if not paged or page_count != limit:
        # NOTE(review): missing loop body ('break'), offset increment and
        # the final 'return info'.
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com videos."""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        """Scrape the <video> source, title and og:description.

        NOTE(review): the dump of this method was missing its guard and
        result-dict lines; they are reconstructed here from the
        surviving fragments.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
        if not m:
            self._downloader.trouble(u'ERROR: unable to find video information')
        video_url = unescapeHTML(m.group('url'))

        m = re.search(r"class='player_page_h1'>\s+<a.*?>(?P<title>.*?)</a>", webpage)
        if not m:
            self._downloader.trouble(u'Cannot find video title')
        title = unescapeHTML(m.group('title'))

        # Description is optional — fall back to None rather than failing.
        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        if m:
            desc = unescapeHTML(m.group('desc'))
        else:
            desc = None

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
        }
        return [info]
class TweetReelIE(InfoExtractor):
    """Information extractor for tweetreel.com."""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?tweetreel\.com/[?](?P<id>[0-9a-z]+)$'

    def _real_extract(self, url):
        """Scrape status id, description, uploader and date, then build
        the direct .mov URL.

        NOTE(review): the dump of this method was missing its guard and
        result-dict lines; they are reconstructed here from the
        surviving fragments.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        m = re.search(r'<div id="left" status_id="([0-9]+)">', webpage)
        if not m:
            self._downloader.trouble(u'ERROR: Cannot find status ID')
        status_id = m.group(1)

        m = re.search(r'<div class="tweet_text">(.*?)</div>', webpage, flags=re.DOTALL)
        if not m:
            self._downloader.trouble(u'WARNING: Cannot find description')
        # Strip embedded <a> tags; keep the plain tweet text.
        desc = unescapeHTML(re.sub('<a.*?</a>', '', m.group(1))).strip()

        m = re.search(r'<div class="tweet_info">.*?from <a target="_blank" href="https?://twitter.com/(?P<uploader_id>.+?)">(?P<uploader>.+?)</a>', webpage, flags=re.DOTALL)
        if not m:
            self._downloader.trouble(u'ERROR: Cannot find uploader')
        uploader = unescapeHTML(m.group('uploader'))
        uploader_id = unescapeHTML(m.group('uploader_id'))

        m = re.search(r'<span unixtime="([0-9]+)"', webpage)
        if not m:
            self._downloader.trouble(u'ERROR: Cannot find upload date')
        upload_date = datetime.datetime.fromtimestamp(int(m.group(1))).strftime('%Y%m%d')

        # The status id maps directly to a hosted .mov file.
        video_url = 'http://files.tweetreel.com/video/' + status_id + '.mov'

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mov',
            'title': desc,
            'description': desc,
            'uploader': uploader,
            'uploader_id': uploader_id,
            'internal_id': status_id,
            'upload_date': upload_date
        }
        return [info]
class SteamIE(InfoExtractor):
    """Information extractor for Steam store trailer pages."""
    # NOTE(review): lines are missing from this dump, including the
    # gameID group line and closing quotes of _VALID_URL, and the list
    # scaffolding inside _real_extract; comments below flag the gaps.
    _VALID_URL = r"""http://store.steampowered.com/
        (?P<urltype>video|app)/ #If the page is only for videos or for a game
        (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL needs the re.VERBOSE flag.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        # Matches the 'movie_<id>' entries in the page's player config.
        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        gameID = m.group('gameID')
        # All trailers for a game live on its /video/<gameID>/ page.
        videourl = 'http://store.steampowered.com/video/%s/' % gameID
        webpage = self._download_webpage(videourl, gameID)
        mweb = re.finditer(urlRE, webpage)
        namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
        titles = re.finditer(namesRE, webpage)
        # NOTE(review): missing result-list initializer.
        for vid,vtitle in zip(mweb,titles):
            video_id = vid.group('videoID')
            title = vtitle.group('videoName')
            video_url = vid.group('videoURL')
            # NOTE(review): missing 'if not video_url:' guard.
            self._downloader.trouble(u'ERROR: Cannot find video url for %s' % video_id)
            # NOTE(review): missing 'info = {' opener with 'id'/'url'/'ext'
            # keys, the append and the final return.
            'title': unescapeHTML(title)
class UstreamIE(InfoExtractor):
    """Information extractor for ustream.tv recorded videos."""
    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        """Build the CDN flv URL from the video id and scrape metadata.

        NOTE(review): the dump of this method was missing its result-dict
        lines; they are reconstructed here from the surviving fragments.
        """
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')
        # The CDN serves the recorded video directly by id.
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)
        m = re.search(r'data-title="(?P<title>.+)"', webpage)
        title = m.group('title')
        m = re.search(r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"', webpage)
        uploader = m.group('uploader')
        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'flv',  # reconstructed — TODO confirm container
            'title': title,
            'uploader': uploader
        }
        return [info]
class RBMARadioIE(InfoExtractor):
    """Information extractor for rbmaradio.com shows."""
    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        """Parse the gon.show JSON blob embedded in the page.

        NOTE(review): the dump of this method was missing its guard,
        'try:' and result-dict lines; they are reconstructed here from
        the surviving fragments.
        """
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')

        webpage = self._download_webpage(url, video_id)
        # Show metadata is embedded as a JSON assignment in an inline script.
        m = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
        if not m:
            raise ExtractorError(u'Cannot find metadata')
        json_data = m.group(1)

        try:
            data = json.loads(json_data)
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        video_url = data['akamai_url'] + '&cbr=256'
        url_parts = compat_urllib_parse_urlparse(video_url)
        # Derive the extension from the stream URL's path.
        video_ext = url_parts.path.rpartition('.')[2]
        info = {
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': data['title'],
            'description': data.get('teaser_text'),
            'location': data.get('country_of_origin'),
            'uploader': data.get('host', {}).get('name'),
            'uploader_id': data.get('host', {}).get('slug'),
            'thumbnail': data.get('image', {}).get('large_url_2x'),
            'duration': data.get('duration'),
        }
        return [info]
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    # NOTE(review): lines are missing from this dump (loop openers,
    # guards, 'else:' branches and parts of the per-format dict);
    # comments below flag the gaps.
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s' % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        # Return the format entry matching req_format.
        # NOTE(review): missing 'for x in formats:' opener and the
        # 'return x' / fallback return lines.
        if(x["format"]==req_format):

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): missing 'if mobj is None:' guard and 'return'.
        self._downloader.trouble(u'ERROR: invalid URL: %s' % url)

        video_id = mobj.group('videoid')

        # The age gate is a simple cookie check.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get the video title
        result = re.search(r'videoTitleArea">(?P<title>.*)</h1>', webpage)
        # NOTE(review): missing 'if result is None:' guard.
        raise ExtractorError(u'ERROR: unable to extract video title')
        video_title = result.group('title').strip()

        # Get the video date
        result = re.search(r'Date:</b>(?P<date>.*)</li>', webpage)
        # NOTE(review): missing guard/'else:' pair around these branches.
        self._downloader.to_stderr(u'WARNING: unable to extract video date')
        upload_date = result.group('date').strip()

        # Get the video uploader
        result = re.search(r'Submitted:</b>(?P<uploader>.*)</li>', webpage)
        # NOTE(review): missing guard/'else:' pair around these branches.
        self._downloader.to_stderr(u'ERROR: unable to extract uploader')
        video_uploader = None
        video_uploader = result.group('uploader').strip()
        video_uploader = clean_html( video_uploader )

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        result = re.search(DOWNLOAD_LIST_RE, webpage)
        # NOTE(review): missing 'if result is None:' guard.
        raise ExtractorError(u'Unable to extract download list')
        download_list_html = result.group('download_list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if(len(links) == 0):
            raise ExtractorError(u'ERROR: no known formats available for video')

        self._downloader.to_screen(u'[youporn] Links found: %d' % len(links))

        # NOTE(review): missing 'formats = []' and 'for link in links:' opener.
        # A link looks like this:
        # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
        # A path looks like this:
        # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
        video_url = unescapeHTML( link )
        path = compat_urllib_parse_urlparse( video_url ).path
        extension = os.path.splitext( path )[1][1:]
        format = path.split('/')[4].split('_')[:2]
        # NOTE(review): missing 'size'/'bitrate' assignments from 'format'.
        format = "-".join( format )
        title = u'%s-%s-%s' % (video_title, size, bitrate)

        # NOTE(review): missing 'formats.append({' opener with
        # 'id'/'url'/'title'/'ext'/'format' entries.
        'uploader': video_uploader,
        'upload_date': upload_date,
        'description': None,

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            # NOTE(review): missing 'return' after listing formats.

        req_format = self._downloader.params.get('format', None)
        self._downloader.to_screen(u'[youporn] Format: %s' % req_format)

        if req_format is None or req_format == 'best':
            # NOTE(review): missing 'return [formats[0]]'.
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            # NOTE(review): missing 'return formats' and the 'else:' opener.
        format = self._specific( req_format, formats )
        # NOTE(review): missing not-found guard and the final returns.
        self._downloader.trouble(u'ERROR: requested format not available')
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        """Extract the flv URL and upload date from the video page.

        NOTE(review): the dump of this method was missing its guard lines;
        they are reconstructed here. The error message in the upload-date
        branch wrongly said "title" — fixed.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')
        video_title = mobj.group('title')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video URL
        VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
        result = re.search(VIDEO_URL_RE, webpage)
        if result is None:
            self._downloader.trouble(u'ERROR: unable to extract video url')
            return
        video_url = compat_urllib_parse.unquote(result.group('url'))

        # Get the uploaded date
        VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
        result = re.search(VIDEO_UPLOADED_RE, webpage)
        if result is None:
            self._downloader.trouble(u'ERROR: unable to extract video upload date')
            return
        upload_date = result.group('date')

        info = {'id': video_id,
                'url': video_url,
                'uploader': None,
                'upload_date': upload_date,
                'title': video_title,
                'ext': 'flv',  # the URL regex only matches .flv streams
                'format': 'flv'}

        return [info]
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        """Follow the embed page and extract the flashvars file URL.

        NOTE(review): the dump of this method was missing its guard and
        result-dict lines; they are reconstructed here from the
        surviving fragments.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('videoid')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video title
        result = re.search(r'<title>(?P<title>.*)</title>', webpage)
        if result is None:
            raise ExtractorError(u'ERROR: unable to extract video title')
        video_title = result.group('title').strip()

        # Get the embed page
        result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        if result is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')

        embed_page_url = result.group(0).strip()
        video_id = result.group('videoid')

        webpage = self._download_webpage(embed_page_url, video_id)

        # The stream URL is passed to the player via so.addVariable("file", ...)
        result = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
        if result is None:
            raise ExtractorError(u'ERROR: unable to extract video url')
        video_url = result.group('source')

        info = {'id': video_id,
                'url': video_url,
                'title': video_title,
                'ext': 'flv',  # reconstructed — TODO confirm container
                'format': 'flv',
                'player_url': embed_page_url}

        return [info]
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes."""
    # NOTE(review): lines are missing from this dump (guards, the mix-id
    # assignment and result-list scaffolding); comments below flag the gaps.
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): missing 'if mobj is None:' guard before this raise.
        raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        # The mix metadata is embedded as a JS assignment.
        m = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
        # NOTE(review): missing 'if not m:' guard before this raise.
        raise ExtractorError(u'Cannot find trax information')
        json_like = m.group(1)
        data = json.loads(json_like)

        # Random session token for the play API.
        session = str(random.randint(0, 1000000000))
        # NOTE(review): missing the assignment of 'mix_id' (used below).
        track_count = data['tracks_count']
        first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
        next_url = first_url

        # Walk the play API one track at a time until at_last_track.
        # NOTE(review): missing result-list initializer.
        for i in itertools.count():
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(i+1), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            # NOTE(review): missing 'info = {' opener for the entries below.
            'id': track_data['id'],
            'url': track_data['track_file_stream_url'],
            'title': track_data['performer'] + u' - ' + track_data['name'],
            'raw_title': track_data['name'],
            'uploader_id': data['user']['login'],
            # NOTE(review): missing dict close/append, and the 'break'
            # under the condition below.
            if api_data['set']['at_last_track']:
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
        # NOTE(review): missing the final return of the collected tracks.
class KeekIE(InfoExtractor):
    """Information extractor for keek.com."""
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'  # NOTE(review): reconstructed; the line was lost in this dump

    def _real_extract(self, url):
        """Build CDN video/thumbnail URLs from the id and scrape metadata.

        NOTE(review): the dump of this method was missing its result-dict
        lines; they are reconstructed here from the surviving fragments.
        """
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')
        # Video and thumbnail live at predictable CDN paths.
        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
        webpage = self._download_webpage(url, video_id)
        m = re.search(r'<meta property="og:title" content="(?P<title>.+)"', webpage)
        title = unescapeHTML(m.group('title'))
        m = re.search(r'<div class="bio-names-and-report">[\s\n]+<h4>(?P<uploader>\w+)</h4>', webpage)
        uploader = unescapeHTML(m.group('uploader'))
        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',  # reconstructed — TODO confirm container
            'title': title,
            'thumbnail': thumbnail,
            'uploader': uploader
        }
        return [info]
class TEDIE(InfoExtractor):
    """Information extractor for ted.com talks and playlists."""
    # NOTE(review): lines are missing from this dump, including the
    # alternation markers and closing quotes of the verbose _VALID_URL,
    # the video_RE opener, dict scaffolding and returns; comments below
    # flag the gaps.
    _VALID_URL=r'''http://www.ted.com/
        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
        ((?P<type_talk>talks)) # We have a simple talk
        /(?P<name>\w+) # Here goes the name and then ".html"

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL needs the re.VERBOSE flag.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        # Single talk vs. playlist dispatch.
        m=re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type_talk'):
            return [self._talk_info(url)]
        # NOTE(review): missing 'else:' opener for the playlist branch.
        playlist_id=m.group('playlist_id')
        name=m.group('name')
        self._downloader.to_screen(u'[%s] Getting info of playlist %s: "%s"' % (self.IE_NAME,playlist_id,name))
        return self._playlist_videos_info(url,name,playlist_id)

    def _talk_video_link(self,mediaSlug):
        '''Returns the video link for that mediaSlug'''
        return 'http://download.ted.com/talks/%s.mp4' % mediaSlug

    def _playlist_videos_info(self,url,name,playlist_id=0):
        '''Returns the videos of the playlist'''
        # NOTE(review): missing the "video_RE=r'''" opener and closing
        # quotes around the three pattern lines below.
        <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
        ([.\s]*?)data-playlist_item_id="(\d+)"
        ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
        video_name_RE=r'<p\ class="talk-title"><a href="/talks/(.+).html">(?P<fullname>.+?)</a></p>'
        webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
        m_names=re.finditer(video_name_RE,webpage)
        # NOTE(review): missing result-list initializer.
        for m_video, m_name in zip(m_videos,m_names):
            # NOTE(review): missing 'video_dic={' opener and 'ext' entry.
            'id': m_video.group('video_id'),
            'url': self._talk_video_link(m_video.group('mediaSlug')),
            'title': m_name.group('fullname')
            info.append(video_dic)
        # NOTE(review): missing 'return info'.

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        m=re.match(self._VALID_URL, url,re.VERBOSE)
        videoName=m.group('name')
        webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
        # If the url includes the language we get the title translated
        title_RE=r'<h1><span id="altHeadline" >(?P<title>[\s\w:/\.\?=\+-\\\']*)</span></h1>'
        title=re.search(title_RE, webpage).group('title')
        info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
            "id":(?P<videoID>[\d]+).*?
            "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
        info_match=re.search(info_RE,webpage,re.VERBOSE)
        video_id=info_match.group('videoID')
        mediaSlug=info_match.group('mediaSlug')
        video_url=self._talk_video_link(mediaSlug)
        # NOTE(review): missing the info-dict construction and 'return info'.
# Extractor for myspass.de: fetches an XML metadata document for the video
# id and reads the flv URL, title, format, description and thumbnail from it.
4048 class MySpassIE(InfoExtractor):
4049 _VALID_URL = r'http://www.myspass.de/.*'
4051 def _real_extract(self, url):
4052 META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'
4054 # video id is the last path element of the URL
4055 # usually there is a trailing slash, so also try the second but last
4056 url_path = compat_urllib_parse_urlparse(url).path
4057 url_parent_path, video_id = os.path.split(url_path)
# NOTE(review): original line 4058 (presumably `if not video_id:`) is
# missing from this excerpt; this fallback handles a trailing slash.
4059 _, video_id = os.path.split(url_parent_path)
# Fetch and parse the XML metadata for this video id.
4062 metadata_url = META_DATA_URL_TEMPLATE % video_id
4063 metadata_text = self._download_webpage(metadata_url, video_id)
4064 metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))
4066 # extract values from metadata
# The flv URL element is mandatory; its absence aborts extraction.
4067 url_flv_el = metadata.find('url_flv')
4068 if url_flv_el is None:
4069 self._downloader.trouble(u'ERROR: unable to extract download url')
# NOTE(review): the `return` after trouble() (line 4070) is missing from
# this excerpt.
4071 video_url = url_flv_el.text
# File extension is derived from the flv URL's suffix (without the dot).
4072 extension = os.path.splitext(video_url)[1][1:]
4073 title_el = metadata.find('title')
4074 if title_el is None:
4075 self._downloader.trouble(u'ERROR: unable to extract title')
4077 title = title_el.text
# format/description/thumbnail are optional metadata fields.
# NOTE(review): lines 4080-4081 (fallback when format_id is absent) are
# missing from this excerpt.
4078 format_id_el = metadata.find('format_id')
4079 if format_id_el is None:
4082 format = format_id_el.text
4083 description_el = metadata.find('description')
4084 if description_el is not None:
4085 description = description_el.text
4088 imagePreview_el = metadata.find('imagePreview')
4089 if imagePreview_el is not None:
4090 thumbnail = imagePreview_el.text
# NOTE(review): lines 4091-4098 (the opening of the returned info dict)
# are missing from this excerpt.
4099 'thumbnail': thumbnail,
4100 'description': description
4104 def gen_extractors():
4105 """ Return a list of an instance of every supported extractor.
4106 The order does matter; the first extractor matched is the one handling the URL.
4109 YoutubePlaylistIE(),
4133 StanfordOpenClassroomIE(),