2 # -*- coding: utf-8 -*-
4 from __future__ import absolute_import
15 import xml.etree.ElementTree
22 class InfoExtractor(object):
23 """Information Extractor class.
25 Information extractors are the classes that, given a URL, extract
26 information about the video (or videos) the URL refers to. This
27 information includes the real video URL, the video title, author and
28 others. The information is stored in a dictionary which is then
29 passed to the FileDownloader. The FileDownloader processes this
30 information possibly downloading the video to the file system, among
31 other possible outcomes.
33 The dictionaries must include the following fields:
37 title: Video title, unescaped.
38 ext: Video filename extension.
40 The following fields are optional:
42 format: The video format, defaults to ext (used for --get-format)
43 thumbnail: Full URL to a video thumbnail image.
44 description: One-line video description.
45 uploader: Full name of the video uploader.
46 upload_date: Video upload date (YYYYMMDD).
47 uploader_id: Nickname or id of the video uploader.
48 location: Physical location of the video.
49 player_url: SWF Player URL (used for rtmpdump).
50 subtitles: The .srt file contents.
51 urlhandle: [internal] The urlHandle to be used to download the file,
52 like returned by urllib.request.urlopen
54 The fields should all be Unicode strings.
56 Subclasses of this one should re-define the _real_initialize() and
57 _real_extract() methods and define a _VALID_URL regexp.
58 Probably, they should also be added to the list of extractors.
60 _real_extract() must return a *list* of information dictionaries as
63 Finally, the _WORKING attribute should be set to False for broken IEs
64 in order to warn the users and skip the tests.
71 def __init__(self, downloader=None):
72 """Constructor. Receives an optional downloader."""
74 self.set_downloader(downloader)
76 def suitable(self, url):
77 """Receives a URL and returns True if suitable for this IE."""
78 return re.match(self._VALID_URL, url) is not None
81 """Getter method for _WORKING."""
85 """Initializes an instance (authentication, etc)."""
87 self._real_initialize()
90 def extract(self, url):
91 """Extracts URL information and returns it in list of dicts."""
93 return self._real_extract(url)
95 def set_downloader(self, downloader):
96 """Sets the downloader for this IE."""
97 self._downloader = downloader
def _real_initialize(self):
    """Real initialization process. Redefine in subclasses."""
    # Default is a no-op; subclasses that need setup (login, age
    # verification, language selection, ...) override this hook.
def _real_extract(self, url):
    """Real extraction process. Redefine in subclasses."""
    # Abstract hook: per the class docstring, implementations must
    # return a *list* of information dictionaries for the given URL.
109 return type(self).__name__[:-2]
111 def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
112 """ Returns the response handle """
114 note = u'Downloading video webpage'
115 self._downloader.to_screen(u'[%s] %s: %s' % (self.IE_NAME, video_id, note))
117 return compat_urllib_request.urlopen(url_or_request)
118 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
120 errnote = u'Unable to download webpage'
121 raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])
123 def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
124 """ Returns the data of the page as a string """
125 urlh = self._request_webpage(url_or_request, video_id, note, errnote)
126 webpage_bytes = urlh.read()
127 return webpage_bytes.decode('utf-8', 'replace')
130 class YoutubeIE(InfoExtractor):
131 """Information extractor for youtube.com."""
135 (?:https?://)? # http(s):// (optional)
136 (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/|
137 tube\.majestyc\.net/) # the various hostnames, with wildcard subdomains
138 (?:.*?\#/)? # handle anchor (#/) redirect urls
139 (?!view_play_list|my_playlists|artist|playlist) # ignore playlist URLs
140 (?: # the various things that can precede the ID:
141 (?:(?:v|embed|e)/) # v/ or embed/ or e/
142 |(?: # or the v= param in all its forms
143 (?:watch(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
144 (?:\?|\#!?) # the params delimiter ? or # or #!
145 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
148 )? # optional -> youtube.com/xxxx is OK
149 )? # all until now is optional -> you can pass the naked ID
150 ([0-9A-Za-z_-]+) # here is it! the YouTube video ID
151 (?(1).+)? # if we found the ID, everything can follow
# Forces the English/US interface so scraped strings are predictable.
_LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
# Google accounts login endpoint (source of the GALX/dsh form fields).
_LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
# Age-gate confirmation endpoint.
_AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
# Captures the target of a next_url redirect parameter.
_NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
# Machine name under which credentials are looked up in ~/.netrc.
_NETRC_MACHINE = 'youtube'
# Listed in order of quality
_available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
# Alternative ordering used when the 'prefer_free_formats' option is set.
_available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
161 _video_extensions = {
167 '38': 'video', # You actually don't know if this will be MOV, AVI or whatever
173 _video_dimensions = {
def suitable(self, url):
    """Return True when the URL matches the (verbose) _VALID_URL pattern."""
    return bool(re.match(self._VALID_URL, url, re.VERBOSE))
def report_lang(self):
    """Announce that the extractor is forcing the site language."""
    message = u'[youtube] Setting language'
    self._downloader.to_screen(message)
def report_login(self):
    """Announce the start of the login attempt."""
    message = u'[youtube] Logging in'
    self._downloader.to_screen(message)
def report_age_confirmation(self):
    """Announce the attempt to confirm the user's age."""
    message = u'[youtube] Confirming age'
    self._downloader.to_screen(message)
def report_video_webpage_download(self, video_id):
    """Announce the download of the watch page for *video_id*."""
    notify = self._downloader.to_screen
    notify(u'[youtube] %s: Downloading video webpage' % video_id)
def report_video_info_webpage_download(self, video_id):
    """Announce the download of the get_video_info page for *video_id*."""
    notify = self._downloader.to_screen
    notify(u'[youtube] %s: Downloading video info webpage' % video_id)
def report_video_subtitles_download(self, video_id):
    """Report attempt to download video subtitles."""
    # Docstring fixed: it previously said "video info webpage",
    # a copy-paste slip from the method above.
    self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)
def report_information_extraction(self, video_id):
    """Announce the start of metadata extraction for *video_id*."""
    notify = self._downloader.to_screen
    notify(u'[youtube] %s: Extracting video information' % video_id)
def report_unavailable_format(self, video_id, format):
    """Report that the requested format is not available for this video."""
    # Docstring fixed: it previously said "Report extracted video URL.",
    # a copy-paste slip unrelated to what this method does.
    self._downloader.to_screen(u'[youtube] %s: Format %s not available' % (video_id, format))
def report_rtmp_download(self):
    """Indicate that the download will use the RTMP protocol."""
    notice = u'[youtube] RTMP download detected'
    self._downloader.to_screen(notice)
232 def _extract_subtitles(self, video_id):
233 self.report_video_subtitles_download(video_id)
234 request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
236 srt_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
237 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
238 return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
239 srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
240 srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
241 if not srt_lang_list:
242 return (u'WARNING: video has no closed captions', None)
243 if self._downloader.params.get('subtitleslang', False):
244 srt_lang = self._downloader.params.get('subtitleslang')
245 elif 'en' in srt_lang_list:
248 srt_lang = list(srt_lang_list.keys())[0]
249 if not srt_lang in srt_lang_list:
250 return (u'WARNING: no closed captions found in the specified language "%s"' % srt_lang, None)
251 params = compat_urllib_parse.urlencode({
253 'name': srt_lang_list[srt_lang].encode('utf-8'),
257 url = 'http://www.youtube.com/api/timedtext?' + params
259 srt = compat_urllib_request.urlopen(url).read().decode('utf-8')
260 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
261 return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
263 return (u'WARNING: Did not fetch video subtitles', None)
266 def _print_formats(self, formats):
267 print('Available formats:')
269 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
271 def _real_initialize(self):
272 if self._downloader is None:
277 downloader_params = self._downloader.params
279 # Attempt to use provided username and password or .netrc data
280 if downloader_params.get('username', None) is not None:
281 username = downloader_params['username']
282 password = downloader_params['password']
283 elif downloader_params.get('usenetrc', False):
285 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
290 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
291 except (IOError, netrc.NetrcParseError) as err:
292 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
296 request = compat_urllib_request.Request(self._LANG_URL)
299 compat_urllib_request.urlopen(request).read()
300 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
301 self._downloader.to_stderr(u'WARNING: unable to set language: %s' % compat_str(err))
304 # No authentication to be performed
308 request = compat_urllib_request.Request(self._LOGIN_URL)
310 login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
311 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
312 self._downloader.to_stderr(u'WARNING: unable to fetch login page: %s' % compat_str(err))
317 match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
319 galx = match.group(1)
321 match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
327 u'continue': u'http://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
331 u'PersistentCookie': u'yes',
333 u'bgresponse': u'js_disabled',
334 u'checkConnection': u'',
335 u'checkedDomains': u'youtube',
341 u'signIn': u'Sign in',
343 u'service': u'youtube',
347 # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
349 login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
350 login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
351 request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
354 login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
355 if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
356 self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
358 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
359 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
365 'action_confirm': 'Confirm',
367 request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
369 self.report_age_confirmation()
370 age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
371 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
372 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
375 def _extract_id(self, url):
376 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
378 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
380 video_id = mobj.group(2)
383 def _real_extract(self, url):
384 # Extract original video URL from URL with redirection, like age verification, using next_url parameter
385 mobj = re.search(self._NEXT_URL_RE, url)
387 url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
388 video_id = self._extract_id(url)
391 self.report_video_webpage_download(video_id)
392 url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
393 request = compat_urllib_request.Request(url)
395 video_webpage_bytes = compat_urllib_request.urlopen(request).read()
396 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
397 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
400 video_webpage = video_webpage_bytes.decode('utf-8', 'ignore')
402 # Attempt to extract SWF player URL
403 mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
405 player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
410 self.report_video_info_webpage_download(video_id)
411 for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
412 video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
413 % (video_id, el_type))
414 request = compat_urllib_request.Request(video_info_url)
416 video_info_webpage_bytes = compat_urllib_request.urlopen(request).read()
417 video_info_webpage = video_info_webpage_bytes.decode('utf-8', 'ignore')
418 video_info = compat_parse_qs(video_info_webpage)
419 if 'token' in video_info:
421 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
422 self._downloader.trouble(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
424 if 'token' not in video_info:
425 if 'reason' in video_info:
426 self._downloader.trouble(u'ERROR: YouTube said: %s' % video_info['reason'][0])
428 self._downloader.trouble(u'ERROR: "token" parameter not in video info for unknown reason')
431 # Check for "rental" videos
432 if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
433 self._downloader.trouble(u'ERROR: "rental" videos not supported')
436 # Start extracting information
437 self.report_information_extraction(video_id)
440 if 'author' not in video_info:
441 self._downloader.trouble(u'ERROR: unable to extract uploader name')
443 video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
446 video_uploader_id = None
447 mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
449 video_uploader_id = mobj.group(1)
451 self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
454 if 'title' not in video_info:
455 self._downloader.trouble(u'ERROR: unable to extract video title')
457 video_title = compat_urllib_parse.unquote_plus(video_info['title'][0])
460 if 'thumbnail_url' not in video_info:
461 self._downloader.trouble(u'WARNING: unable to extract video thumbnail')
463 else: # don't panic if we can't find it
464 video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
468 mobj = re.search(r'id="eow-date.*?>(.*?)</span>', video_webpage, re.DOTALL)
470 upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
471 format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
472 for expression in format_expressions:
474 upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
479 video_description = get_element_by_id("eow-description", video_webpage)
480 if video_description:
481 video_description = clean_html(video_description)
483 video_description = ''
486 video_subtitles = None
487 if self._downloader.params.get('subtitleslang', False):
488 self._downloader.params['writesubtitles'] = True
489 if self._downloader.params.get('onlysubtitles', False):
490 self._downloader.params['writesubtitles'] = True
491 if self._downloader.params.get('writesubtitles', False):
492 (srt_error, video_subtitles) = self._extract_subtitles(video_id)
494 self._downloader.trouble(srt_error)
496 if 'length_seconds' not in video_info:
497 self._downloader.trouble(u'WARNING: unable to extract video duration')
500 video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
503 video_token = compat_urllib_parse.unquote_plus(video_info['token'][0])
505 # Decide which formats to download
506 req_format = self._downloader.params.get('format', None)
508 if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
509 self.report_rtmp_download()
510 video_url_list = [(None, video_info['conn'][0])]
511 elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
512 url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
513 url_data = [compat_parse_qs(uds) for uds in url_data_strs]
514 url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
515 url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)
517 format_limit = self._downloader.params.get('format_limit', None)
518 available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
519 if format_limit is not None and format_limit in available_formats:
520 format_list = available_formats[available_formats.index(format_limit):]
522 format_list = available_formats
523 existing_formats = [x for x in format_list if x in url_map]
524 if len(existing_formats) == 0:
525 self._downloader.trouble(u'ERROR: no known formats available for video')
527 if self._downloader.params.get('listformats', None):
528 self._print_formats(existing_formats)
530 if req_format is None or req_format == 'best':
531 video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
532 elif req_format == 'worst':
533 video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality
534 elif req_format in ('-1', 'all'):
535 video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
537 # Specific formats. We pick the first in a slash-delimeted sequence.
538 # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
539 req_formats = req_format.split('/')
540 video_url_list = None
541 for rf in req_formats:
543 video_url_list = [(rf, url_map[rf])]
545 if video_url_list is None:
546 self._downloader.trouble(u'ERROR: requested format not available')
549 self._downloader.trouble(u'ERROR: no conn or url_encoded_fmt_stream_map information found in video info')
553 for format_param, video_real_url in video_url_list:
555 video_extension = self._video_extensions.get(format_param, 'flv')
557 video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
558 self._video_dimensions.get(format_param, '???'))
562 'url': video_real_url,
563 'uploader': video_uploader,
564 'uploader_id': video_uploader_id,
565 'upload_date': upload_date,
566 'title': video_title,
567 'ext': video_extension,
568 'format': video_format,
569 'thumbnail': video_thumbnail,
570 'description': video_description,
571 'player_url': player_url,
572 'subtitles': video_subtitles,
573 'duration': video_duration
578 class MetacafeIE(InfoExtractor):
579 """Information Extractor for metacafe.com."""
# Watch-page URL; group(1) is the video id used by _real_extract.
_VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
# Family-filter disclaimer page fetched during initialization.
_DISCLAIMER = 'http://www.metacafe.com/family_filter/'
# Endpoint POSTed to in order to switch the family filter off.
_FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
IE_NAME = u'metacafe'
def __init__(self, downloader=None):
    """Build the extractor and hand the optional downloader to the base class."""
    super(MetacafeIE, self).__init__(downloader)
def report_disclaimer(self):
    """Announce retrieval of the family-filter disclaimer page."""
    message = u'[metacafe] Retrieving disclaimer'
    self._downloader.to_screen(message)
def report_age_confirmation(self):
    """Announce the attempt to confirm the user's age."""
    message = u'[metacafe] Confirming age'
    self._downloader.to_screen(message)
def report_download_webpage(self, video_id):
    """Announce the webpage download for *video_id*."""
    notify = self._downloader.to_screen
    notify(u'[metacafe] %s: Downloading webpage' % video_id)
def report_extraction(self, video_id):
    """Announce the start of metadata extraction for *video_id*."""
    notify = self._downloader.to_screen
    notify(u'[metacafe] %s: Extracting information' % video_id)
605 def _real_initialize(self):
606 # Retrieve disclaimer
607 request = compat_urllib_request.Request(self._DISCLAIMER)
609 self.report_disclaimer()
610 disclaimer = compat_urllib_request.urlopen(request).read()
611 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
612 self._downloader.trouble(u'ERROR: unable to retrieve disclaimer: %s' % compat_str(err))
618 'submit': "Continue - I'm over 18",
620 request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form))
622 self.report_age_confirmation()
623 disclaimer = compat_urllib_request.urlopen(request).read()
624 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
625 self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
628 def _real_extract(self, url):
629 # Extract id and simplified title from URL
630 mobj = re.match(self._VALID_URL, url)
632 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
635 video_id = mobj.group(1)
637 # Check if video comes from YouTube
638 mobj2 = re.match(r'^yt-(.*)$', video_id)
639 if mobj2 is not None:
640 self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
643 # Retrieve video webpage to extract further information
644 request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
646 self.report_download_webpage(video_id)
647 webpage = compat_urllib_request.urlopen(request).read()
648 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
649 self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
652 # Extract URL, uploader and title from webpage
653 self.report_extraction(video_id)
654 mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
656 mediaURL = compat_urllib_parse.unquote(mobj.group(1))
657 video_extension = mediaURL[-3:]
659 # Extract gdaKey if available
660 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
664 gdaKey = mobj.group(1)
665 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
667 mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
669 self._downloader.trouble(u'ERROR: unable to extract media URL')
671 vardict = compat_parse_qs(mobj.group(1))
672 if 'mediaData' not in vardict:
673 self._downloader.trouble(u'ERROR: unable to extract media URL')
675 mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
677 self._downloader.trouble(u'ERROR: unable to extract media URL')
679 mediaURL = mobj.group(1).replace('\\/', '/')
680 video_extension = mediaURL[-3:]
681 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
683 mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
685 self._downloader.trouble(u'ERROR: unable to extract title')
687 video_title = mobj.group(1).decode('utf-8')
689 mobj = re.search(r'submitter=(.*?);', webpage)
691 self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
693 video_uploader = mobj.group(1)
696 'id': video_id.decode('utf-8'),
697 'url': video_url.decode('utf-8'),
698 'uploader': video_uploader.decode('utf-8'),
700 'title': video_title,
701 'ext': video_extension.decode('utf-8'),
705 class DailymotionIE(InfoExtractor):
706 """Information Extractor for Dailymotion"""
# Case-insensitive match on any dailymotion TLD; group(1) carries the
# video id (further trimmed at '_'/'?' by _real_extract).
_VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
IE_NAME = u'dailymotion'
def __init__(self, downloader=None):
    """Build the extractor and hand the optional downloader to the base class."""
    super(DailymotionIE, self).__init__(downloader)
def report_extraction(self, video_id):
    """Announce the start of metadata extraction for *video_id*."""
    notify = self._downloader.to_screen
    notify(u'[dailymotion] %s: Extracting information' % video_id)
719 def _real_extract(self, url):
720 # Extract id and simplified title from URL
721 mobj = re.match(self._VALID_URL, url)
723 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
726 video_id = mobj.group(1).split('_')[0].split('?')[0]
728 video_extension = 'mp4'
730 # Retrieve video webpage to extract further information
731 request = compat_urllib_request.Request(url)
732 request.add_header('Cookie', 'family_filter=off')
733 webpage = self._download_webpage(request, video_id)
735 # Extract URL, uploader and title from webpage
736 self.report_extraction(video_id)
737 mobj = re.search(r'\s*var flashvars = (.*)', webpage)
739 self._downloader.trouble(u'ERROR: unable to extract media URL')
741 flashvars = compat_urllib_parse.unquote(mobj.group(1))
743 for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']:
746 self._downloader.to_screen(u'[dailymotion] Using %s' % key)
749 self._downloader.trouble(u'ERROR: unable to extract video URL')
752 mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
754 self._downloader.trouble(u'ERROR: unable to extract video URL')
757 video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/')
759 # TODO: support choosing qualities
761 mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
763 self._downloader.trouble(u'ERROR: unable to extract title')
765 video_title = unescapeHTML(mobj.group('title'))
767 video_uploader = None
768 mobj = re.search(r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>', webpage)
770 # lookin for official user
771 mobj_official = re.search(r'<span rel="author"[^>]+?>([^<]+?)</span>', webpage)
772 if mobj_official is None:
773 self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
775 video_uploader = mobj_official.group(1)
777 video_uploader = mobj.group(1)
779 video_upload_date = None
780 mobj = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
782 video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)
787 'uploader': video_uploader,
788 'upload_date': video_upload_date,
789 'title': video_title,
790 'ext': video_extension,
794 class PhotobucketIE(InfoExtractor):
795 """Information extractor for photobucket.com."""
# Matches pages whose 'current' query parameter points at a .flv file;
# group(1) is that file name and serves as the video id.
_VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
IE_NAME = u'photobucket'
def __init__(self, downloader=None):
    """Build the extractor and hand the optional downloader to the base class."""
    super(PhotobucketIE, self).__init__(downloader)
def report_download_webpage(self, video_id):
    """Announce the webpage download for *video_id*."""
    notify = self._downloader.to_screen
    notify(u'[photobucket] %s: Downloading webpage' % video_id)
def report_extraction(self, video_id):
    """Announce the start of metadata extraction for *video_id*."""
    notify = self._downloader.to_screen
    notify(u'[photobucket] %s: Extracting information' % video_id)
811 def _real_extract(self, url):
812 # Extract id from URL
813 mobj = re.match(self._VALID_URL, url)
815 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
818 video_id = mobj.group(1)
820 video_extension = 'flv'
822 # Retrieve video webpage to extract further information
823 request = compat_urllib_request.Request(url)
825 self.report_download_webpage(video_id)
826 webpage = compat_urllib_request.urlopen(request).read()
827 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
828 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
831 # Extract URL, uploader, and title from webpage
832 self.report_extraction(video_id)
833 mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
835 self._downloader.trouble(u'ERROR: unable to extract media URL')
837 mediaURL = compat_urllib_parse.unquote(mobj.group(1))
841 mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
843 self._downloader.trouble(u'ERROR: unable to extract title')
845 video_title = mobj.group(1).decode('utf-8')
847 video_uploader = mobj.group(2).decode('utf-8')
850 'id': video_id.decode('utf-8'),
851 'url': video_url.decode('utf-8'),
852 'uploader': video_uploader,
854 'title': video_title,
855 'ext': video_extension.decode('utf-8'),
859 class YahooIE(InfoExtractor):
860 """Information extractor for video.yahoo.com."""
# _VALID_URL matches all Yahoo! Video URLs
# _VPAGE_URL matches only the extractable '/watch/' URLs
# URLs that do not match _VPAGE_URL are rewritten to /watch/ form and
# re-extracted (see _real_extract); group(2) of _VALID_URL is the video id.
_VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
_VPAGE_URL = r'(?:http://)?video\.yahoo\.com/watch/([0-9]+)/([0-9]+)(?:[#\?].*)?'
IE_NAME = u'video.yahoo'
def __init__(self, downloader=None):
    """Build the extractor and hand the optional downloader to the base class."""
    super(YahooIE, self).__init__(downloader)
def report_download_webpage(self, video_id):
    """Announce the webpage download for *video_id*."""
    notify = self._downloader.to_screen
    notify(u'[video.yahoo] %s: Downloading webpage' % video_id)
def report_extraction(self, video_id):
    """Announce the start of metadata extraction for *video_id*."""
    notify = self._downloader.to_screen
    notify(u'[video.yahoo] %s: Extracting information' % video_id)
880 def _real_extract(self, url, new_video=True):
881 # Extract ID from URL
882 mobj = re.match(self._VALID_URL, url)
884 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
887 video_id = mobj.group(2)
888 video_extension = 'flv'
890 # Rewrite valid but non-extractable URLs as
891 # extractable English language /watch/ URLs
892 if re.match(self._VPAGE_URL, url) is None:
893 request = compat_urllib_request.Request(url)
895 webpage = compat_urllib_request.urlopen(request).read()
896 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
897 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
900 mobj = re.search(r'\("id", "([0-9]+)"\);', webpage)
902 self._downloader.trouble(u'ERROR: Unable to extract id field')
904 yahoo_id = mobj.group(1)
906 mobj = re.search(r'\("vid", "([0-9]+)"\);', webpage)
908 self._downloader.trouble(u'ERROR: Unable to extract vid field')
910 yahoo_vid = mobj.group(1)
912 url = 'http://video.yahoo.com/watch/%s/%s' % (yahoo_vid, yahoo_id)
913 return self._real_extract(url, new_video=False)
915 # Retrieve video webpage to extract further information
916 request = compat_urllib_request.Request(url)
918 self.report_download_webpage(video_id)
919 webpage = compat_urllib_request.urlopen(request).read()
920 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
921 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
924 # Extract uploader and title from webpage
925 self.report_extraction(video_id)
926 mobj = re.search(r'<meta name="title" content="(.*)" />', webpage)
928 self._downloader.trouble(u'ERROR: unable to extract video title')
930 video_title = mobj.group(1).decode('utf-8')
932 mobj = re.search(r'<h2 class="ti-5"><a href="http://video\.yahoo\.com/(people|profile)/[0-9]+" beacon=".*">(.*)</a></h2>', webpage)
934 self._downloader.trouble(u'ERROR: unable to extract video uploader')
936 video_uploader = mobj.group(1).decode('utf-8')
938 # Extract video thumbnail
939 mobj = re.search(r'<link rel="image_src" href="(.*)" />', webpage)
941 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
943 video_thumbnail = mobj.group(1).decode('utf-8')
945 # Extract video description
946 mobj = re.search(r'<meta name="description" content="(.*)" />', webpage)
948 self._downloader.trouble(u'ERROR: unable to extract video description')
950 video_description = mobj.group(1).decode('utf-8')
951 if not video_description:
952 video_description = 'No description available.'
954 # Extract video height and width
955 mobj = re.search(r'<meta name="video_height" content="([0-9]+)" />', webpage)
957 self._downloader.trouble(u'ERROR: unable to extract video height')
959 yv_video_height = mobj.group(1)
961 mobj = re.search(r'<meta name="video_width" content="([0-9]+)" />', webpage)
963 self._downloader.trouble(u'ERROR: unable to extract video width')
965 yv_video_width = mobj.group(1)
967 # Retrieve video playlist to extract media URL
968 # I'm not completely sure what all these options are, but we
969 # seem to need most of them, otherwise the server sends a 401.
970 yv_lg = 'R0xx6idZnW2zlrKP8xxAIR' # not sure what this represents
971 yv_bitrate = '700' # according to Wikipedia this is hard-coded
972 request = compat_urllib_request.Request('http://cosmos.bcst.yahoo.com/up/yep/process/getPlaylistFOP.php?node_id=' + video_id +
973 '&tech=flash&mode=playlist&lg=' + yv_lg + '&bitrate=' + yv_bitrate + '&vidH=' + yv_video_height +
974 '&vidW=' + yv_video_width + '&swf=as3&rd=video.yahoo.com&tk=null&adsupported=v1,v2,&eventid=1301797')
976 self.report_download_webpage(video_id)
977 webpage = compat_urllib_request.urlopen(request).read()
978 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
979 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
982 # Extract media URL from playlist XML
983 mobj = re.search(r'<STREAM APP="(http://.*)" FULLPATH="/?(/.*\.flv\?[^"]*)"', webpage)
985 self._downloader.trouble(u'ERROR: Unable to extract media URL')
987 video_url = compat_urllib_parse.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
988 video_url = unescapeHTML(video_url)
991 'id': video_id.decode('utf-8'),
993 'uploader': video_uploader,
995 'title': video_title,
996 'ext': video_extension.decode('utf-8'),
997 'thumbnail': video_thumbnail.decode('utf-8'),
998 'description': video_description,
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # _VALID_URL matches Vimeo URLs, capturing the protocol, an optional
    # player redirect marker, and the numeric video id.
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[vimeo] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[vimeo] %s: Extracting information' % video_id)

    def _real_extract(self, url, new_video=True):
        # NOTE(review): several control-flow lines appear elided from this
        # view of the file ("if mobj is None:" guards, "try:" openers,
        # "return" statements, and the opening of the final info dict).
        # Elision points are flagged below; confirm against the full source.
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        # [elided: "if mobj is None:" guard and its "return"]
        self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        video_id = mobj.group('id')
        # Normalize: force https and resolve player-redirect links to the
        # canonical video page.
        if not mobj.group('proto'):
            url = 'https://' + url
        if mobj.group('direct_link'):
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        # [elided: "try:" opener for the network error handler below]
            self.report_download_webpage(video_id)
            webpage_bytes = compat_urllib_request.urlopen(request).read()
            webpage = webpage_bytes.decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and latter we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON embedded in the page's player bootstrap.
        # [elided: "try:" opener for the parse-failure handler below]
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        # [elided: "except" clause preceding this error report]
            self._downloader.trouble(u'ERROR: unable to extract info section')

        # Extract title
        video_title = config["video"]["title"]

        # Extract uploader and uploader_id
        video_uploader = config["video"]["owner"]["name"]
        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]

        # Extract video thumbnail
        video_thumbnail = config["video"]["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description: video_description = clean_html(video_description)
        else: video_description = ''

        # Extract upload date (normalized to YYYYMMDD)
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = { 'hd': [], 'sd': [], 'other': []}
        for codec_name, codec_extension in codecs:
            if codec_name in config["video"]["files"]:
                if 'hd' in config["video"]["files"][codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in config["video"]["files"][codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                # [elided: "else:" branch preceding the fallback append]
                    files['other'].append((codec_name, codec_extension, config["video"]["files"][codec_name][0]))

        # Pick the first available entry, preferring hd > sd > other.
        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_quality = files[quality][0][2]
                video_codec = files[quality][0][0]
                video_extension = files[quality][0][1]
                self._downloader.to_screen(u'[vimeo] %s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
        # [elided: "break" / "else:" handling preceding this error report]
            self._downloader.trouble(u'ERROR: no known codec found')

        # Build the final media URL from the signature/timestamp pair.
        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    %(video_id, sig, timestamp, video_quality, video_codec.upper())

        # [elided: "return [{" opener and the 'id'/'url' entries of the info dict]
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date': video_upload_date,
            'title': video_title,
            'ext': video_extension,
            'thumbnail': video_thumbnail,
            'description': video_description,
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    # NOTE(review): several structural lines appear elided from this view of
    # the file ("try:" openers, "if mobj is None:" guards, "return"
    # statements, and some call-argument lines). Elision points are flagged
    # below; confirm against the full source.
    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    _LIVE_URL = r'index-[0-9]+\.html$'

    IE_NAME = u'arte.tv'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'[arte.tv] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)

    def fetch_webpage(self, url):
        """Download *url*, reporting failures through the downloader."""
        request = compat_urllib_request.Request(url)
        # [elided: "try:" opener]
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
        # [elided: "return" for the error path]
        except ValueError as err:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
        # [elided: "return webpage"]

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Fetch *url*, match *regex* against it, and collect the groups
        listed in *matchTuples* (group-index, dict-key, error-message) into
        an info dict."""
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        # [elided: "info = {}" initializer and "if mobj is None:" guard]
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)

        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                self._downloader.trouble(err)
            # [elided: "else:" branch preceding the assignment]
                info[key] = mobj.group(i)

        # [elided: "return info"]

    def extractLiveStream(self, url):
        """Handle the live-stream flavour of an arte.tv URL."""
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            # [elided: leading url/flags arguments of this call]
            r'src="(.*?/videothek_js.*?\.js)',
            (1, 'url', u'ERROR: Invalid URL: %s' % url)
        # [elided: closing bracket/paren of this call]
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        info = self.grep_webpage(
            # [elided: leading url/flags arguments of this call]
            r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' +
                '(http://.*?\.swf).*?' +
            # [elided: trailing regex fragment]
            (1, 'path', u'ERROR: could not extract video path: %s' % url),
            (2, 'player', u'ERROR: could not extract video player: %s' % url),
            (3, 'url', u'ERROR: could not extract video url: %s' % url)
        # [elided: closing bracket/paren of this call]
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Handle the "+7" (catch-up) flavour of an arte.tv URL, following
        the chain of redirect documents down to the final video metadata."""
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            # [elided: leading url/flags arguments of this call]
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            (1, 'url', u'ERROR: Invalid URL: %s' % url)
        # [elided: closing bracket/paren of this call]
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            # [elided: leading url/flags arguments of this call]
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            (1, 'url', u'ERROR: Could not find <video> tag: %s' % url)
        # [elided: closing bracket/paren of this call]
        next_url = compat_urllib_parse.unquote(info.get('url'))

        info = self.grep_webpage(
            # [elided: leading url/flags arguments of this call]
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            (1, 'id', u'ERROR: could not extract video id: %s' % url),
            (2, 'title', u'ERROR: could not extract video title: %s' % url),
            (3, 'date', u'ERROR: could not extract video date: %s' % url),
            (4, 'url', u'ERROR: could not extract video url: %s' % url)
        # [elided: closing bracket/paren and "return {" opener]
            'id': info.get('id'),
            'url': compat_urllib_parse.unquote(info.get('url')),
            'uploader': u'arte.tv',
            'upload_date': info.get('date'),
            'title': info.get('title').decode('utf-8'),

    def _real_extract(self, url):
        """Dispatch between the live-stream and +7 extraction paths based on
        the trailing URL component."""
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
        # [elided: "return" and "else:" preceding the +7 path]
            info = self.extractPlus7Stream(url)
        # [elided: "return [info]"]
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor."""

    # NOTE(review): several structural lines appear elided from this view of
    # the file ("try:" openers, "return" statements, some keyword arguments
    # of multi-line calls, and "if mobj is None:" guards). Elision points are
    # flagged below; confirm against the full source.
    IE_NAME = u'generic'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self._downloader.to_screen(u'WARNING: Falling back on generic information extractor.')
        self._downloader.to_screen(u'[generic] %s: Downloading webpage' % video_id)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[generic] %s: Extracting information' % video_id)

    def report_following_redirect(self, new_url):
        """Report information extraction."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case restart chain."""
        class HeadRequest(compat_urllib_request.Request):
            # Request subclass that issues HEAD instead of GET.
            def get_method(self):
                # [elided: return "HEAD"]

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    # Drop body-related headers; a HEAD request carries no body.
                    newheaders = dict((k,v) for k,v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       # [elided: headers= keyword argument]
                                       origin_req_host=req.get_origin_req_host(),
                                       # [elided: unverifiable= keyword argument]
                # [elided: "else:" preceding the raise]
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                # [elided: fp.read()/fp.close() cleanup lines]
                newheaders = dict((k,v) for k,v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                                                      # [elided: headers= keyword argument]
                                                                      origin_req_host=req.get_origin_req_host(),
                                                                      # [elided: unverifiable= keyword argument]

        # Build an opener that HEADs the URL, following redirects, so we can
        # detect URL shorteners without downloading the page body.
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        new_url = response.geturl()

        # [elided: "if url == new_url: return False" short-circuit]
        self.report_following_redirect(new_url)
        self._downloader.download([new_url])
        # [elided: "return True"]

    def _real_extract(self, url):
        if self._test_redirect(url): return

        video_id = url.split('/')[-1]
        request = compat_urllib_request.Request(url)
        # [elided: "try:" opener]
            self.report_download_webpage(video_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
        # [elided: "return"]
        except ValueError as err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
        # [elided: "return"]

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        # [elided: "if mobj is None:" guard]
            # Broaden the search a little bit
            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        # [elided: "if mobj is None:" guard]
            # Broaden the search a little bit: JWPlayer JS loader
            mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
        # [elided: "if mobj is None:" guard]
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
        # [elided: "return"]

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
        # [elided: "return"]

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        mobj = re.search(r'<title>(.*)</title>', webpage)
        # [elided: "if mobj is None:" guard]
            self._downloader.trouble(u'ERROR: unable to extract title')
        # [elided: "return"]
        video_title = mobj.group(1)

        # video uploader is domain name
        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
        # [elided: "if mobj is None:" guard]
            self._downloader.trouble(u'ERROR: unable to extract title')
        # [elided: "return"]
        video_uploader = mobj.group(1)

        # [elided: "return [{" opener and 'id'/'url' entries of the info dict]
            'uploader': video_uploader,
            'upload_date': None,
            'title': video_title,
            'ext': video_extension,
class YoutubeSearchIE(InfoExtractor):
    """Information Extractor for YouTube search queries."""

    # NOTE(review): structural lines appear elided from this view of the file
    # ("if mobj is None:"/"if prefix == '':" branches, "try:" openers,
    # "return" statements, and loop initializers). Flagged below.
    _VALID_URL = r'ytsearch(\d+|all)?:[\s\S]+'
    # JSON-C feed of the v2 GData API, paged 50 results at a time.
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _max_youtube_results = 1000
    IE_NAME = u'youtube:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the "ytsearchN:terms" query and dispatch the download."""
        mobj = re.match(self._VALID_URL, query)
        # [elided: "if mobj is None:" guard]
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
        # [elided: "return"]

        prefix, query = query.split(':')
        # [elided: "if prefix == '':" branch]
            query = query.encode('utf-8')
            self._download_n_results(query, 1)
        # [elided: "return"]
        elif prefix == 'all':
            self._download_n_results(query, self._max_youtube_results)
        # [elided: "return" / "else:" / "try: n = int(prefix)" / "if n <= 0:"]
                self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
        # [elided: "return"]
            elif n > self._max_youtube_results:
                self._downloader.to_stderr(u'WARNING: ytsearch returns max %i results (you requested %i)' % (self._max_youtube_results, n))
                n = self._max_youtube_results
            self._download_n_results(query, n)
        # [elided: "return"]
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)
        # [elided: "return"]

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        # [elided: "video_ids = []" / "pagenum = 0" / "limit = n" initializers]
        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum+1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50*pagenum)+1)
            request = compat_urllib_request.Request(result_url)
            # [elided: "try:" opener]
                data = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download API page: %s' % compat_str(err))
            # [elided: "return"]
            api_response = json.loads(data)['data']

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            # Cap the limit at what the API says actually exists.
            limit = min(n, api_response['totalItems'])
            # [elided: "pagenum += 1"]

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
        # [elided: "return"]
class GoogleSearchIE(InfoExtractor):
    """Information Extractor for Google Video search queries."""

    # NOTE(review): structural lines appear elided from this view of the file
    # (guards, "try:" openers, "return" statements, loop initializers and the
    # enclosing "while True:" of the pagination loop). Flagged below.
    _VALID_URL = r'gvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.google.com/videosearch?q=%s+site:video.google.com&start=%s&hl=en'
    _VIDEO_INDICATOR = r'<a href="http://video\.google\.com/videoplay\?docid=([^"\&]+)'
    # Presence of the "next" pager link means more result pages exist.
    _MORE_PAGES_INDICATOR = r'class="pn" id="pnnext"'
    _max_google_results = 1000
    IE_NAME = u'video.google:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.google] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the "gvsearchN:terms" query and dispatch the download."""
        mobj = re.match(self._VALID_URL, query)
        # [elided: "if mobj is None:" guard]
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
        # [elided: "return"]

        prefix, query = query.split(':')
        # [elided: "if prefix == '':" branch]
            query = query.encode('utf-8')
            self._download_n_results(query, 1)
        # [elided: "return"]
        elif prefix == 'all':
            self._download_n_results(query, self._max_google_results)
        # [elided: "return" / "else:" / "try: n = int(prefix)" / "if n <= 0:"]
                self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
        # [elided: "return"]
            elif n > self._max_google_results:
                self._downloader.to_stderr(u'WARNING: gvsearch returns max %i results (you requested %i)' % (self._max_google_results, n))
                n = self._max_google_results
            self._download_n_results(query, n)
        # [elided: "return"]
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)
        # [elided: "return"]

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        # [elided: "video_ids = []" / "pagenum = 0" initializers and the
        # enclosing "while True:" pagination loop]
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum*10)
            request = compat_urllib_request.Request(result_url)
            # [elided: "try:" opener]
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            # [elided: "return"]

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in video_ids:
                    video_ids.append(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
                        # [elided: "return"]

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for id in video_ids:
                    self._downloader.download(['http://video.google.com/videoplay?docid=%s' % id])
                # [elided: "return"]

            pagenum = pagenum + 1
class YahooSearchIE(InfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""

    # NOTE(review): structural lines appear elided from this view of the file
    # (guards, "try:" openers, "return" statements, loop initializers and the
    # enclosing pagination loop). Flagged below.
    _VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
    _TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
    _VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
    _MORE_PAGES_INDICATOR = r'\s*Next'
    _max_yahoo_results = 1000
    IE_NAME = u'video.yahoo:search'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, query, pagenum):
        """Report attempt to download playlist page with given number."""
        query = query.decode(preferredencoding())
        self._downloader.to_screen(u'[video.yahoo] query "%s": Downloading page %s' % (query, pagenum))

    def _real_extract(self, query):
        """Parse the "yvsearchN:terms" query and dispatch the download."""
        mobj = re.match(self._VALID_URL, query)
        # [elided: "if mobj is None:" guard]
            self._downloader.trouble(u'ERROR: invalid search query "%s"' % query)
        # [elided: "return"]

        prefix, query = query.split(':')
        # [elided: "if prefix == '':" branch]
            query = query.encode('utf-8')
            self._download_n_results(query, 1)
        # [elided: "return"]
        elif prefix == 'all':
            self._download_n_results(query, self._max_yahoo_results)
        # [elided: "return" / "else:" / "try: n = int(prefix)" / "if n <= 0:"]
                self._downloader.trouble(u'ERROR: invalid download number %s for query "%s"' % (n, query))
        # [elided: "return"]
            elif n > self._max_yahoo_results:
                self._downloader.to_stderr(u'WARNING: yvsearch returns max %i results (you requested %i)' % (self._max_yahoo_results, n))
                n = self._max_yahoo_results
            self._download_n_results(query, n)
        # [elided: "return"]
        except ValueError: # parsing prefix as integer fails
            self._download_n_results(query, 1)
        # [elided: "return"]

    def _download_n_results(self, query, n):
        """Downloads a specified number of results for a query"""

        # [elided: "video_ids = []" / "pagenum" initializers]
        already_seen = set()

        # [elided: enclosing pagination loop opener]
            self.report_download_page(query, pagenum)
            result_url = self._TEMPLATE_URL % (compat_urllib_parse.quote_plus(query), pagenum)
            request = compat_urllib_request.Request(result_url)
            # [elided: "try:" opener]
                page = compat_urllib_request.urlopen(request).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            # [elided: "return"]

            # Extract video identifiers
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                video_id = mobj.group(1)
                if video_id not in already_seen:
                    video_ids.append(video_id)
                    already_seen.add(video_id)
                    if len(video_ids) == n:
                        # Specified n videos reached
                        for id in video_ids:
                            self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                        # [elided: "return"]

            if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                for id in video_ids:
                    self._downloader.download(['http://video.yahoo.com/watch/%s' % id])
                # [elided: "return"]

            pagenum = pagenum + 1
class YoutubePlaylistIE(InfoExtractor):
    """Information Extractor for YouTube playlists."""

    # NOTE(review): structural lines appear elided from this view of the file
    # (guards, "try:" openers, "return"/"break" statements, loop
    # initializers and the enclosing pagination loop). Flagged below.
    _VALID_URL = r'(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:(?:course|view_play_list|my_playlists|artist|playlist)\?.*?(p|a|list)=|user/.*?/user/|p/|user/.*?#[pg]/c/)(?:PL|EC)?|PL|EC)([0-9A-Za-z-_]{10,})(?:/.*?/([0-9A-Za-z_-]+))?.*'
    _TEMPLATE_URL = 'http://www.youtube.com/%s?%s=%s&page=%s&gl=US&hl=en'
    _VIDEO_INDICATOR_TEMPLATE = r'/watch\?v=(.+?)&([^&"]+&)*list=.*?%s'
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:playlist'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, playlist_id, pagenum):
        """Report attempt to download playlist page with given number."""
        self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))

    def _real_extract(self, url):
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        # [elided: "if mobj is None:" guard]
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
        # [elided: "return"]

        # Single video case: delegate to the plain watch-page extractor.
        if mobj.group(3) is not None:
            self._downloader.download([mobj.group(3)])
        # [elided: "return"]

        # Download playlist pages
        # prefix is 'p' as default for playlists but there are other types that need extra care
        playlist_prefix = mobj.group(1)
        if playlist_prefix == 'a':
            playlist_access = 'artist'
        # [elided: "else:" branch]
            playlist_prefix = 'p'
            playlist_access = 'view_play_list'
        playlist_id = mobj.group(2)
        # [elided: "video_ids = []" / "pagenum = 1" initializers and the
        # enclosing "while True:" pagination loop]
            self.report_download_page(playlist_id, pagenum)
            url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
            request = compat_urllib_request.Request(url)
            # [elided: "try:" opener]
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            # [elided: "return"]

            # Extract video identifiers
            # [elided: "ids_in_page = []" initializer]
            for mobj in re.finditer(self._VIDEO_INDICATOR_TEMPLATE % playlist_id, page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))
            video_ids.extend(ids_in_page)

            if self._MORE_PAGES_INDICATOR not in page:
            # [elided: "break"]
            pagenum = pagenum + 1

        total = len(video_ids)

        # Apply --playlist-start / --playlist-end slicing (1-based options).
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)
        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        # [elided: "else:" branch]
            video_ids = video_ids[playliststart:playlistend]

        if len(video_ids) == total:
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total))
        # [elided: "else:" branch]
            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(video_ids)))

        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
        # [elided: "return"]
class YoutubeChannelIE(InfoExtractor):
    """Information Extractor for YouTube channels."""

    # NOTE(review): structural lines appear elided from this view of the file
    # (guards, "try:" openers, "return"/"break" statements, loop
    # initializers and the enclosing pagination loop). Flagged below.
    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
    _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
    IE_NAME = u'youtube:channel'

    def report_download_page(self, channel_id, pagenum):
        """Report attempt to download channel page with given number."""
        self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))

    def _real_extract(self, url):
        # Extract channel id
        mobj = re.match(self._VALID_URL, url)
        # [elided: "if mobj is None:" guard]
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
        # [elided: "return"]

        # Download channel pages
        channel_id = mobj.group(1)
        # [elided: "video_ids = []" / "pagenum = 1" initializers and the
        # enclosing "while True:" pagination loop]
            self.report_download_page(channel_id, pagenum)
            url = self._TEMPLATE_URL % (channel_id, pagenum)
            request = compat_urllib_request.Request(url)
            # [elided: "try:" opener]
                page = compat_urllib_request.urlopen(request).read().decode('utf8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            # [elided: "return"]

            # Extract video identifiers
            # [elided: "ids_in_page = []" initializer]
            for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))
            video_ids.extend(ids_in_page)

            if self._MORE_PAGES_INDICATOR not in page:
            # [elided: "break"]
            pagenum = pagenum + 1

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        for id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
        # [elided: "return"]
class YoutubeUserIE(InfoExtractor):
    """Information Extractor for YouTube users."""

    # NOTE(review): structural lines appear elided from this view of the file
    # (guards, "try:" openers, "return"/"break" statements, loop
    # initializers and the enclosing pagination loop). Flagged below.
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/user/)|ytuser:)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'http://gdata.youtube.com/feeds/api/users/%s'
    # GData caps each uploads query at this many results, hence paging.
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'http://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d'
    _VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
    IE_NAME = u'youtube:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, start_index):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
                                   (username, start_index, start_index + self._GDATA_PAGE_SIZE))

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        # [elided: "if mobj is None:" guard]
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
        # [elided: "return"]

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # [elided: rest of comment, "video_ids = []"/"pagenum = 0"
        # initializers and the enclosing "while True:" pagination loop]
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            self.report_download_page(username, start_index)

            request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))

            # [elided: "try:" opener]
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
            # [elided: "return"]

            # Extract video identifiers
            # [elided: "ids_in_page = []" initializer]
            for mobj in re.finditer(self._VIDEO_INDICATOR, page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(mobj.group(1))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # [elided: rest of comment]
            if len(ids_in_page) < self._GDATA_PAGE_SIZE:
            # [elided: "break" and "pagenum += 1"]

        all_ids_count = len(video_ids)
        # Apply --playlist-start / --playlist-end slicing (1-based options).
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        # [elided: "else:" branch]
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
                                   (username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    # NOTE(review): structural lines appear elided from this view of the file
    # (guards, "try:" openers, "return"/"break" statements, loop
    # initializers, the enclosing pagination loop, and apparently the
    # "_PAGE_SIZE" class attribute referenced below). Flagged below.
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    IE_NAME = u'blip.tv:user'

    def __init__(self, downloader=None):
        InfoExtractor.__init__(self, downloader)

    def report_download_page(self, username, pagenum):
        """Report attempt to download user page."""
        self._downloader.to_screen(u'[%s] user %s: Downloading video ids from page %d' %
                                   (self.IE_NAME, username, pagenum))

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        # [elided: "if mobj is None:" guard]
            self._downloader.trouble(u'ERROR: invalid url: %s' % url)
        # [elided: "return"]

        username = mobj.group(1)

        # The mobile episode-list endpoint; users_id is resolved below.
        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        request = compat_urllib_request.Request(url)

        # [elided: "try:" opener]
            page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            mobj = re.search(r'data-users-id="([^"]+)"', page)
            page_base = page_base % mobj.group(1)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
        # [elided: "return"]

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # [elided: rest of comment, "video_ids = []"/"pagenum = 1"
        # initializers and the enclosing "while True:" pagination loop]
            self.report_download_page(username, pagenum)

            request = compat_urllib_request.Request( page_base + "&page=" + str(pagenum) )

            # [elided: "try:" opener]
                page = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                # NOTE(review): this handler uses str(err), unlike the
                # compat_str(err) used elsewhere in the file.
                self._downloader.trouble(u'ERROR: unable to download webpage: %s' % str(err))
            # [elided: "return"]

            # Extract video identifiers
            # [elided: "ids_in_page = []" initializer]
            for mobj in re.finditer(r'href="/([^"]+)"', page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(unescapeHTML(mobj.group(1)))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # [elided: rest of comment]
            if len(ids_in_page) < self._PAGE_SIZE:
            # [elided: "break" and "pagenum += 1"]

        all_ids_count = len(video_ids)
        # Apply --playlist-start / --playlist-end slicing (1-based options).
        playliststart = self._downloader.params.get('playliststart', 1) - 1
        playlistend = self._downloader.params.get('playlistend', -1)

        if playlistend == -1:
            video_ids = video_ids[playliststart:]
        # [elided: "else:" branch]
            video_ids = video_ids[playliststart:playlistend]

        self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
                                   (self.IE_NAME, username, all_ids_count, len(video_ids)))

        for video_id in video_ids:
            self._downloader.download([u'http://blip.tv/'+video_id])
# NOTE(review): structural lines (try:, if mobj is None:, return, the
# dict literal braces) are elided in this excerpt; comments describe
# only the visible code.
1958 class DepositFilesIE(InfoExtractor):
1959 """Information extractor for depositfiles.com"""
# The (?#locale) comment group matches the two-letter locale path
# segment, e.g. /de/files/... or /en/files/...
1961 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
1963 def report_download_webpage(self, file_id):
1964 """Report webpage download."""
1965 self._downloader.to_screen(u'[DepositFiles] %s: Downloading webpage' % file_id)
1967 def report_extraction(self, file_id):
1968 """Report information extraction."""
1969 self._downloader.to_screen(u'[DepositFiles] %s: Extracting information' % file_id)
# Fetch the file page as if the 'Free download' button had been
# pressed, then scrape the real fileshare URL and title out of it.
1971 def _real_extract(self, url):
1972 file_id = url.split('/')[-1]
1973 # Rebuild url in english locale
1974 url = 'http://depositfiles.com/en/files/' + file_id
1976 # Retrieve file webpage with 'Free download' button pressed
# POSTing gateway_result=1 simulates the free-download form submit.
1977 free_download_indication = { 'gateway_result' : '1' }
1978 request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
1980 self.report_download_webpage(file_id)
1981 webpage = compat_urllib_request.urlopen(request).read()
1982 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1983 self._downloader.trouble(u'ERROR: Unable to retrieve file webpage: %s' % compat_str(err))
1986 # Search for the real file URL
1987 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
1988 if (mobj is None) or (mobj.group(1) is None):
1989 # Try to figure out reason of the error.
# The site explains download restrictions in a <strong>Attention...
# banner; surface that text to the user when present.
1990 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
1991 if (mobj is not None) and (mobj.group(1) is not None):
1992 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
1993 self._downloader.trouble(u'ERROR: %s' % restriction_message)
1995 self._downloader.trouble(u'ERROR: unable to extract download URL from: %s' % url)
1998 file_url = mobj.group(1)
# Extension without the leading dot, e.g. 'zip'.
1999 file_extension = os.path.splitext(file_url)[1][1:]
2001 # Search for file title
2002 mobj = re.search(r'<b title="(.*?)">', webpage)
2004 self._downloader.trouble(u'ERROR: unable to extract title')
# NOTE(review): the .decode('utf-8') calls below assume byte strings
# (Python 2 semantics); on Python 3 these would raise AttributeError —
# confirm against the compat layer in use.
2006 file_title = mobj.group(1).decode('utf-8')
2009 'id': file_id.decode('utf-8'),
2010 'url': file_url.decode('utf-8'),
2012 'upload_date': None,
2013 'title': file_title,
2014 'ext': file_extension.decode('utf-8'),
# NOTE(review): structural lines (try:, return, if-branches, the
# result dict braces) are elided in this excerpt; comments describe
# only the visible code.
2018 class FacebookIE(InfoExtractor):
2019 """Information Extractor for Facebook"""
2021 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
2022 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
2023 _NETRC_MACHINE = 'facebook'
2024 IE_NAME = u'facebook'
2026 def report_login(self):
2027 """Report attempt to log in."""
2028 self._downloader.to_screen(u'[%s] Logging in' % self.IE_NAME)
# Optional login step: tries credentials from --username/--password
# first, then from the user's .netrc ('facebook' machine). Login
# failures are warnings, not fatal — extraction may still work for
# public videos.
2030 def _real_initialize(self):
2031 if self._downloader is None:
2036 downloader_params = self._downloader.params
2038 # Attempt to use provided username and password or .netrc data
2039 if downloader_params.get('username', None) is not None:
2040 useremail = downloader_params['username']
2041 password = downloader_params['password']
2042 elif downloader_params.get('usenetrc', False):
2044 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
2045 if info is not None:
2049 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
2050 except (IOError, netrc.NetrcParseError) as err:
2051 self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
2054 if useremail is None:
2063 request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
2066 login_results = compat_urllib_request.urlopen(request).read()
# A login <form> still present in the response means the login failed.
2067 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
2068 self._downloader.to_stderr(u'WARNING: unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
2070 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2071 self._downloader.to_stderr(u'WARNING: unable to log in: %s' % compat_str(err))
2074 def _real_extract(self, url):
2075 mobj = re.match(self._VALID_URL, url)
2077 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2079 video_id = mobj.group('ID')
2081 url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
2082 webpage = self._download_webpage(url, video_id)
# The flash variables are embedded as a JSON array between these two
# literal JS fragments; locate it by exact (escaped) match.
2084 BEFORE = '[["allowFullScreen","true"],["allowScriptAccess","always"],["salign","tl"],["scale","noscale"],["wmode","opaque"]].forEach(function(param) {swf.addParam(param[0], param[1]);});\n'
2085 AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
2086 m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
2088 raise ExtractorError(u'Cannot parse data')
# The captured JSON is a list of [key, value] pairs → dict.
2089 data = dict(json.loads(m.group(1)))
2090 params_raw = compat_urllib_parse.unquote(data['params'])
2091 params = json.loads(params_raw)
# Prefer the HD stream; fall back to SD when visible branch applies.
2092 video_url = params['hd_src']
2094 video_url = params['sd_src']
2096 raise ExtractorError(u'Cannot find video URL')
2097 video_duration = int(params['video_duration'])
2099 m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
2101 raise ExtractorError(u'Cannot find title in webpage')
2102 video_title = unescapeHTML(m.group(1))
2106 'title': video_title,
2109 'duration': video_duration,
2110 'thumbnail': params['thumbnail_src'],
# NOTE(review): structural lines (try:, if-branches, returns, dict
# braces) are elided in this excerpt; comments describe only the
# visible code.
2115 class BlipTVIE(InfoExtractor):
2116 """Information extractor for blip.tv"""
2118 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv(/.+)$'
# Used to pull the filename extension off the media URL.
2119 _URL_EXT = r'^.*\.([a-z0-9]+)$'
2120 IE_NAME = u'blip.tv'
2122 def report_extraction(self, file_id):
2123 """Report information extraction."""
2124 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
2126 def report_direct_download(self, title):
2127 """Report information extraction."""
2128 self._downloader.to_screen(u'[%s] %s: Direct download detected' % (self.IE_NAME, title))
# Ask blip.tv for the JSON rendition of the page (skin=json). If the
# server answers with the media itself (Content-Type video/*), treat
# it as a direct download instead of parsing JSON.
2130 def _real_extract(self, url):
2131 mobj = re.match(self._VALID_URL, url)
2133 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2140 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
2141 request = compat_urllib_request.Request(json_url)
# The iTunes user agent is required for the API to respond; it is
# also recorded in the result below so the downloader reuses it.
2142 request.add_header('User-Agent', 'iTunes/10.6.1')
2143 self.report_extraction(mobj.group(1))
2146 urlh = compat_urllib_request.urlopen(request)
2147 if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
2148 basename = url.split('/')[-1]
2149 title,ext = os.path.splitext(basename)
# NOTE(review): .decode('UTF-8') assumes a byte string (Python 2
# semantics) — would raise AttributeError on a py3 str; confirm.
2150 title = title.decode('UTF-8')
2151 ext = ext.replace('.', '')
2152 self.report_direct_download(title)
2157 'upload_date': None,
2162 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2163 raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
2164 if info is None: # Regular URL
2166 json_code_bytes = urlh.read()
2167 json_code = json_code_bytes.decode('utf-8')
2168 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2169 self._downloader.trouble(u'ERROR: unable to read video info webpage: %s' % compat_str(err))
2173 json_data = json.loads(json_code)
# The payload is either wrapped in a 'Post' object or is the data
# directly (visible branch handles the wrapped case).
2174 if 'Post' in json_data:
2175 data = json_data['Post']
# blip.tv datestamps look like '08-15-12 10:30AM' → YYYYMMDD.
2179 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
2180 video_url = data['media']['url']
2181 umobj = re.match(self._URL_EXT, video_url)
2183 raise ValueError('Can not determine filename extension')
2184 ext = umobj.group(1)
2187 'id': data['item_id'],
2189 'uploader': data['display_name'],
2190 'upload_date': upload_date,
2191 'title': data['title'],
2193 'format': data['media']['mimeType'],
2194 'thumbnail': data['thumbnailUrl'],
2195 'description': data['description'],
2196 'player_url': data['embedUrl'],
2197 'user_agent': 'iTunes/10.6.1',
# Missing keys / bad datestamp both surface here as a parse error.
2199 except (ValueError,KeyError) as err:
2200 self._downloader.trouble(u'ERROR: unable to parse video information: %s' % repr(err))
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de."""

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        InfoExtractor.__init__(self, downloader)

    def report_extraction(self, video_id):
        """Report information extraction."""
        self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)

    def _real_extract(self, url):
        """Extract the flv media URL and title for a myvideo.de watch page.

        Returns a one-element list of info dictionaries, or reports an
        error through the downloader and returns None.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # BUGFIX: was self._download.trouble — no such attribute exists
            # on InfoExtractor (set_downloader stores self._downloader),
            # so the invalid-URL path raised AttributeError instead of
            # reporting the problem.
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group(1)

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)
        # The thumbnail link embeds the movie base URL; the flv lives at
        # <base>/<video_id>.flv.
        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\' />',
                 webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract media URL')
            return
        video_url = mobj.group(1) + ('/%s.flv' % video_id)

        mobj = re.search('<title>([^<]+)</title>', webpage)
        if mobj is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return

        video_title = mobj.group(1)

        return [{
            'id':       video_id,
            'url':      video_url,
            'uploader': None,
            'upload_date': None,
            'title':    video_title,
            'ext':      u'flv',
        }]
# NOTE(review): structural lines (try:, if-branches, returns, loop
# headers, literal braces) are elided in this excerpt; comments
# describe only the visible code.
2255 class ComedyCentralIE(InfoExtractor):
2256 """Information extractor for The Daily Show and Colbert Report """
2258 # urls can be abbreviations like :thedailyshow or :colbert
2259 # urls for episodes like:
2260 # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
2261 # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
2262 # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
# Verbose-mode regex: whitespace inside is ignored, so it must always
# be matched with re.VERBOSE (see suitable()/_real_extract below).
2263 _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
2264 |(https?://)?(www\.)?
2265 (?P<showname>thedailyshow|colbertnation)\.com/
2266 (full-episodes/(?P<episode>.*)|
2268 (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
2269 |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
# Bitrates known to be offered, lowest to highest.
2272 _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
2274 _video_extensions = {
2282 _video_dimensions = {
# Overridden because the base class does not pass re.VERBOSE.
2291 def suitable(self, url):
2292 """Receives a URL and returns True if suitable for this IE."""
2293 return re.match(self._VALID_URL, url, re.VERBOSE) is not None
2295 def report_extraction(self, episode_id):
2296 self._downloader.to_screen(u'[comedycentral] %s: Extracting information' % episode_id)
2298 def report_config_download(self, episode_id, media_id):
2299 self._downloader.to_screen(u'[comedycentral] %s: Downloading configuration for %s' % (episode_id, media_id))
2301 def report_index_download(self, episode_id):
2302 self._downloader.to_screen(u'[comedycentral] %s: Downloading show index' % episode_id)
# Pretty-print formats for --list-formats.
2304 def _print_formats(self, formats):
2305 print('Available formats:')
2307 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))
2310 def _real_extract(self, url):
2311 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2313 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# Expand :tds / :colbert style abbreviations to the show's
# full-episodes page, then re-match.
2316 if mobj.group('shortname'):
2317 if mobj.group('shortname') in ('tds', 'thedailyshow'):
2318 url = u'http://www.thedailyshow.com/full-episodes/'
2320 url = u'http://www.colbertnation.com/full-episodes/'
2321 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2322 assert mobj is not None
2324 if mobj.group('clip'):
2325 if mobj.group('showname') == 'thedailyshow':
2326 epTitle = mobj.group('tdstitle')
2328 epTitle = mobj.group('cntitle')
2331 dlNewest = not mobj.group('episode')
2333 epTitle = mobj.group('showname')
2335 epTitle = mobj.group('episode')
2337 req = compat_urllib_request.Request(url)
2338 self.report_extraction(epTitle)
2340 htmlHandle = compat_urllib_request.urlopen(req)
2341 html = htmlHandle.read()
2342 webpage = html.decode('utf-8')
2343 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2344 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
# The full-episodes page redirects to the newest episode; recover the
# final URL from the opened handle and re-validate it.
2347 url = htmlHandle.geturl()
2348 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
2350 self._downloader.trouble(u'ERROR: Invalid redirected URL: ' + url)
2352 if mobj.group('episode') == '':
2353 self._downloader.trouble(u'ERROR: Redirected URL is still not specific: ' + url)
2355 epTitle = mobj.group('episode')
# Locate the mtvnservices player URI embedded in the page.
2357 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)
2359 if len(mMovieParams) == 0:
2360 # The Colbert Report embeds the information in a without
2361 # a URL prefix; so extract the alternate reference
2362 # and then add the URL prefix manually.
2364 altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
2365 if len(altMovieParams) == 0:
2366 self._downloader.trouble(u'ERROR: unable to find Flash URL in webpage ' + url)
2369 mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
2371 uri = mMovieParams[0][1]
# Fetch the MRSS index: one <item> per episode part.
2372 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
2373 self.report_index_download(epTitle)
2375 indexXml = compat_urllib_request.urlopen(indexUrl).read()
2376 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2377 self._downloader.trouble(u'ERROR: unable to download episode index: ' + compat_str(err))
2382 idoc = xml.etree.ElementTree.fromstring(indexXml)
2383 itemEls = idoc.findall('.//item')
2384 for partNum,itemEl in enumerate(itemEls):
# guid looks like mgid:...:<showId>.com:<shortMediaId>.
2385 mediaId = itemEl.findall('./guid')[0].text
2386 shortMediaId = mediaId.split(':')[-1]
2387 showId = mediaId.split(':')[-2].replace('.com', '')
2388 officialTitle = itemEl.findall('./title')[0].text
2389 officialDate = itemEl.findall('./pubDate')[0].text
# Per-part config XML lists the available renditions.
2391 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
2392 compat_urllib_parse.urlencode({'uri': mediaId}))
2393 configReq = compat_urllib_request.Request(configUrl)
2394 self.report_config_download(epTitle, shortMediaId)
2396 configXml = compat_urllib_request.urlopen(configReq).read()
2397 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2398 self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
2401 cdoc = xml.etree.ElementTree.fromstring(configXml)
# Build (bitrate, rtmp_url) tuples from each <rendition>.
2403 for rendition in cdoc.findall('.//rendition'):
2404 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
2408 self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
2411 if self._downloader.params.get('listformats', None):
2412 self._print_formats([i[0] for i in turls])
2415 # For now, just pick the highest bitrate
2416 format,rtmp_video_url = turls[-1]
2418 # Get the format arg from the arg stream
2419 req_format = self._downloader.params.get('format', None)
2421 # Select format if we can find one
2424 format, rtmp_video_url = f, v
# The RTMP URL is rewritten into a plain HTTP URL on the Limelight
# CDN: keep the gsp.comedystor/... tail and graft it onto `base`.
2427 m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
2429 raise ExtractorError(u'Cannot transform RTMP url')
2430 base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
2431 video_url = base + m.group('finalid')
# Multi-part episodes get ' part N' appended to the title.
2433 effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
2438 'upload_date': officialDate,
2443 'description': officialTitle,
2445 results.append(info)
# NOTE(review): structural lines (try:, if-guards, returns, dict
# braces) are elided in this excerpt; comments describe only the
# visible code.
2450 class EscapistIE(InfoExtractor):
2451 """Information extractor for The Escapist """
2453 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
2454 IE_NAME = u'escapist'
2456 def report_extraction(self, showName):
2457 self._downloader.to_screen(u'[escapist] %s: Extracting information' % showName)
2459 def report_config_download(self, showName):
2460 self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)
2462 def _real_extract(self, url):
2463 mobj = re.match(self._VALID_URL, url)
2465 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2467 showName = mobj.group('showname')
2468 videoId = mobj.group('episode')
2470 self.report_extraction(showName)
2472 webPage = compat_urllib_request.urlopen(url)
2473 webPageBytes = webPage.read()
# Decode using the charset advertised in Content-Type, defaulting
# to UTF-8 when none is given.
2474 m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
2475 webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
2476 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2477 self._downloader.trouble(u'ERROR: unable to download webpage: ' + compat_str(err))
# Pull description/thumbnail/player URL out of the page's meta tags,
# then extract the player's config= query parameter.
2480 descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
2481 description = unescapeHTML(descMatch.group(1))
2482 imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
2483 imgUrl = unescapeHTML(imgMatch.group(1))
2484 playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
2485 playerUrl = unescapeHTML(playerUrlMatch.group(1))
2486 configUrlMatch = re.search('config=(.*)$', playerUrl)
2487 configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))
2489 self.report_config_download(showName)
2491 configJSON = compat_urllib_request.urlopen(configUrl)
2492 m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
2493 configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
2494 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2495 self._downloader.trouble(u'ERROR: unable to download configuration: ' + compat_str(err))
2498 # Technically, it's JavaScript, not JSON
# Single→double quote replacement makes the JS object literal
# parseable as JSON (fragile, but matches the site's output).
2499 configJSON = configJSON.replace("'", '"')
2502 config = json.loads(configJSON)
2503 except (ValueError,) as err:
2504 self._downloader.trouble(u'ERROR: Invalid JSON in configuration file: ' + compat_str(err))
# The media URL is the second playlist entry.
2507 playlist = config['playlist']
2508 videoUrl = playlist[1]['url']
2513 'uploader': showName,
2514 'upload_date': None,
2517 'thumbnail': imgUrl,
2518 'description': description,
2519 'player_url': playerUrl,
# NOTE(review): structural lines (try:, returns, dict braces) are
# elided in this excerpt; comments describe only the visible code.
2524 class CollegeHumorIE(InfoExtractor):
2525 """Information extractor for collegehumor.com"""
2528 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
2529 IE_NAME = u'collegehumor'
2531 def report_manifest(self, video_id):
2532 """Report information extraction."""
2533 self._downloader.to_screen(u'[%s] %s: Downloading XML manifest' % (self.IE_NAME, video_id))
2535 def report_extraction(self, video_id):
2536 """Report information extraction."""
2537 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
# Two-step extraction: moogaloop metadata XML for title/description/
# manifest URL, then the f4m manifest to build the final segment URL.
2539 def _real_extract(self, url):
2540 mobj = re.match(self._VALID_URL, url)
2542 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2544 video_id = mobj.group('videoid')
2549 'upload_date': None,
2552 self.report_extraction(video_id)
2553 xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
2555 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2556 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2557 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
2560 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2562 videoNode = mdoc.findall('./video')[0]
2563 info['description'] = videoNode.findall('./description')[0].text
2564 info['title'] = videoNode.findall('./caption')[0].text
2565 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
2566 manifest_url = videoNode.findall('./file')[0].text
2568 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
# hdcore is an Adobe HDS/f4m player version hint the server expects.
2571 manifest_url += '?hdcore=2.10.3'
2572 self.report_manifest(video_id)
2574 manifestXml = compat_urllib_request.urlopen(manifest_url).read()
2575 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2576 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
# Parse the f4m manifest (Adobe F4M namespace) for the media node.
2579 adoc = xml.etree.ElementTree.fromstring(manifestXml)
2581 media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
2582 node_id = media_node.attrib['url']
2583 video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
2584 except IndexError as err:
2585 self._downloader.trouble(u'\nERROR: Invalid manifest file')
# Assemble the HDS fragment URL (first segment/fragment) on the
# manifest's own host.
2588 url_pr = compat_urllib_parse_urlparse(manifest_url)
2589 url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
# NOTE(review): structural lines (if-guards, returns, dict braces)
# are elided in this excerpt; comments describe only the visible code.
2596 class XVideosIE(InfoExtractor):
2597 """Information extractor for xvideos.com"""
2599 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
2600 IE_NAME = u'xvideos'
2602 def report_extraction(self, video_id):
2603 """Report information extraction."""
2604 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2606 def _real_extract(self, url):
2607 mobj = re.match(self._VALID_URL, url)
2609 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2611 video_id = mobj.group(1)
2613 webpage = self._download_webpage(url, video_id)
2615 self.report_extraction(video_id)
# The page embeds the media location as a URL-encoded flv_url
# flashvars parameter.
2619 mobj = re.search(r'flv_url=(.+?)&', webpage)
2621 self._downloader.trouble(u'ERROR: unable to extract video url')
2623 video_url = compat_urllib_parse.unquote(mobj.group(1))
# Title is the <title> tag minus the trailing " - XVID..." suffix.
2627 mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
2629 self._downloader.trouble(u'ERROR: unable to extract video title')
2631 video_title = mobj.group(1)
2634 # Extract video thumbnail
# group(0) is deliberate: the whole matched string is the full
# thumbnail URL (group(1) would be just the file name).
2635 mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
2637 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
2639 video_thumbnail = mobj.group(0)
2645 'upload_date': None,
2646 'title': video_title,
2648 'thumbnail': video_thumbnail,
2649 'description': None,
# NOTE(review): structural lines (try:, returns, dict braces) are
# elided in this excerpt; comments describe only the visible code.
2655 class SoundcloudIE(InfoExtractor):
2656 """Information extractor for soundcloud.com
2657 To access the media, the uid of the song and a stream token
2658 must be extracted from the page source and the script must make
2659 a request to media.soundcloud.com/crossdomain.xml. Then
2660 the media can be grabbed by requesting from an url composed
2661 of the stream token and uid
# URL shape: soundcloud.com/<uploader>/<slug-title>.
2664 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
2665 IE_NAME = u'soundcloud'
2667 def __init__(self, downloader=None):
2668 InfoExtractor.__init__(self, downloader)
2670 def report_resolve(self, video_id):
2671 """Report information extraction."""
2672 self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))
2674 def report_extraction(self, video_id):
2675 """Report information extraction."""
2676 self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))
2678 def _real_extract(self, url):
2679 mobj = re.match(self._VALID_URL, url)
2681 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2684 # extract uploader (which is in the url)
2685 uploader = mobj.group(1)
2686 # extract simple title (uploader + slug of song title)
2687 slug_title = mobj.group(2)
2688 simple_title = uploader + u'-' + slug_title
2690 self.report_resolve('%s/%s' % (uploader, slug_title))
# Step 1: resolve the human URL into track metadata via the public
# API (the client_id here is the one the web player ships with).
2692 url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
2693 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2694 request = compat_urllib_request.Request(resolv_url)
2696 info_json_bytes = compat_urllib_request.urlopen(request).read()
2697 info_json = info_json_bytes.decode('utf-8')
2698 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2699 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
2702 info = json.loads(info_json)
2703 video_id = info['id']
2704 self.report_extraction('%s/%s' % (uploader, slug_title))
# Step 2: ask the CDN for the track's stream URLs and pick the
# 128kbit MP3 HTTP stream.
2706 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
2707 request = compat_urllib_request.Request(streams_url)
2709 stream_json_bytes = compat_urllib_request.urlopen(request).read()
2710 stream_json = stream_json_bytes.decode('utf-8')
2711 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2712 self._downloader.trouble(u'ERROR: unable to download stream definitions: %s' % compat_str(err))
2715 streams = json.loads(stream_json)
2716 mediaURL = streams['http_mp3_128_url']
2721 'uploader': info['user']['username'],
# NOTE(review): 'created_at' is passed through verbatim — the API's
# timestamp format, not the YYYYMMDD shape documented for
# upload_date on the class; confirm whether conversion is intended.
2722 'upload_date': info['created_at'],
2723 'title': info['title'],
2725 'description': info['description'],
# NOTE(review): structural lines (if-guards, returns, dict braces)
# are elided in this excerpt; comments describe only the visible code.
2729 class InfoQIE(InfoExtractor):
2730 """Information extractor for infoq.com"""
2731 _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
2733 def report_extraction(self, video_id):
2734 """Report information extraction."""
2735 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2737 def _real_extract(self, url):
2738 mobj = re.match(self._VALID_URL, url)
2740 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# No numeric id in the URL; the URL itself doubles as the id for
# logging purposes.
2743 webpage = self._download_webpage(url, video_id=url)
2744 self.report_extraction(url)
# The RTMP path is stored base64-encoded in the jsclassref JS var.
2747 mobj = re.search(r"jsclassref='([^']*)'", webpage)
2749 self._downloader.trouble(u'ERROR: unable to extract video url')
2751 real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
2752 video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id
2755 mobj = re.search(r'contentTitle = "(.*?)";', webpage)
2757 self._downloader.trouble(u'ERROR: unable to extract video title')
2759 video_title = mobj.group(1)
2761 # Extract description
2762 video_description = u'No description available.'
2763 mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
2764 if mobj is not None:
2765 video_description = mobj.group(1)
# Derive id and extension from the media file name, e.g.
# '.../12-jan-foo.mp4' -> id '12-jan-foo', ext 'mp4'.
2767 video_filename = video_url.split('/')[-1]
2768 video_id, extension = video_filename.split('.')
2774 'upload_date': None,
2775 'title': video_title,
2776 'ext': extension, # Extension is always(?) mp4, but seems to be flv
2778 'description': video_description,
# NOTE(review): structural lines (try:, returns, loop headers, dict
# braces) are elided in this excerpt; comments describe only the
# visible code.
2783 class MixcloudIE(InfoExtractor):
2784 """Information extractor for www.mixcloud.com"""
# Marked broken: tests are skipped and users are warned.
2786 _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
2787 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
2788 IE_NAME = u'mixcloud'
2790 def __init__(self, downloader=None):
2791 InfoExtractor.__init__(self, downloader)
2793 def report_download_json(self, file_id):
2794 """Report JSON download."""
2795 self._downloader.to_screen(u'[%s] Downloading json' % self.IE_NAME)
2797 def report_extraction(self, file_id):
2798 """Report information extraction."""
2799 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
# Returns the URL list for a given format; per-format data is either
# a {bitrate: [urls]} dict or a bare [urls] list (TypeError branch).
2801 def get_urls(self, jsonData, fmt, bitrate='best'):
2802 """Get urls from 'audio_formats' section in json"""
2805 bitrate_list = jsonData[fmt]
2806 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
2807 bitrate = max(bitrate_list) # select highest
2809 url_list = jsonData[fmt][bitrate]
2810 except TypeError: # we have no bitrate info.
2811 url_list = jsonData[fmt]
# Probe each candidate URL and return the first one that opens.
2814 def check_urls(self, url_list):
2815 """Returns 1st active url from list"""
2816 for url in url_list:
2818 compat_urllib_request.urlopen(url)
2820 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
# Pretty-print available formats for --list-formats; handles the
# same dict-or-list shape as get_urls.
2825 def _print_formats(self, formats):
2826 print('Available formats:')
2827 for fmt in formats.keys():
2828 for b in formats[fmt]:
2830 ext = formats[fmt][b][0]
2831 print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
2832 except TypeError: # we have no bitrate info
2833 ext = formats[fmt][0]
2834 print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
2837 def _real_extract(self, url):
2838 mobj = re.match(self._VALID_URL, url)
2840 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
2842 # extract uploader & filename from url
# NOTE(review): the .decode('utf-8') calls in this method assume
# byte strings (Python 2 semantics); on Python 3 they would raise
# AttributeError on str — likely part of why _WORKING is False.
2843 uploader = mobj.group(1).decode('utf-8')
2844 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
2846 # construct API request
2847 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
2848 # retrieve .json file with links to files
2849 request = compat_urllib_request.Request(file_url)
2851 self.report_download_json(file_url)
2852 jsonData = compat_urllib_request.urlopen(request).read()
2853 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2854 self._downloader.trouble(u'ERROR: Unable to retrieve file: %s' % compat_str(err))
2858 json_data = json.loads(jsonData)
2859 player_url = json_data['player_swf_url']
2860 formats = dict(json_data['audio_formats'])
2862 req_format = self._downloader.params.get('format', None)
2865 if self._downloader.params.get('listformats', None):
2866 self._print_formats(formats)
# No explicit format requested: take the first format whose URL
# list yields a live URL.
2869 if req_format is None or req_format == 'best':
2870 for format_param in formats.keys():
2871 url_list = self.get_urls(formats, format_param)
2873 file_url = self.check_urls(url_list)
2874 if file_url is not None:
2877 if req_format not in formats:
2878 self._downloader.trouble(u'ERROR: format is not available')
2881 url_list = self.get_urls(formats, req_format)
2882 file_url = self.check_urls(url_list)
2883 format_param = req_format
2886 'id': file_id.decode('utf-8'),
2887 'url': file_url.decode('utf-8'),
2888 'uploader': uploader.decode('utf-8'),
2889 'upload_date': None,
2890 'title': json_data['name'],
2891 'ext': file_url.split('.')[-1].decode('utf-8'),
2892 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
2893 'thumbnail': json_data['thumbnail_url'],
2894 'description': json_data['description'],
2895 'player_url': player_url.decode('utf-8'),
# NOTE(review): structural lines (try:, returns, dict/list braces)
# are elided in this excerpt; comments describe only the visible code.
2898 class StanfordOpenClassroomIE(InfoExtractor):
2899 """Information extractor for Stanford's Open ClassRoom"""
# Three URL shapes: a specific video (course+video), a course page
# (course only), and the site root (neither) — handled as three
# branches in _real_extract below.
2901 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
2902 IE_NAME = u'stanfordoc'
2904 def report_download_webpage(self, objid):
2905 """Report information extraction."""
2906 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, objid))
2908 def report_extraction(self, video_id):
2909 """Report information extraction."""
2910 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
2912 def _real_extract(self, url):
2913 mobj = re.match(self._VALID_URL, url)
2915 raise ExtractorError(u'Invalid URL: %s' % url)
# Branch 1: a single video — fetch its metadata XML and build the
# media URL from the course's videos/ base.
2917 if mobj.group('course') and mobj.group('video'): # A specific video
2918 course = mobj.group('course')
2919 video = mobj.group('video')
2921 'id': course + '_' + video,
2923 'upload_date': None,
2926 self.report_extraction(info['id'])
2927 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
2928 xmlUrl = baseUrl + video + '.xml'
2930 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
2931 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2932 self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
2934 mdoc = xml.etree.ElementTree.fromstring(metaXml)
2936 info['title'] = mdoc.findall('./title')[0].text
2937 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
2939 self._downloader.trouble(u'\nERROR: Invalid metadata XML file')
2941 info['ext'] = info['url'].rpartition('.')[2]
# Branch 2: a course page — collect all VideoPage links and recurse
# into self.extract for each referenced video.
2943 elif mobj.group('course'): # A course page
2944 course = mobj.group('course')
2949 'upload_date': None,
2952 coursepage = self._download_webpage(url, info['id'],
2953 note='Downloading course info page',
2954 errnote='Unable to download course info page')
2956 m = re.search('<h1>([^<]+)</h1>', coursepage)
2958 info['title'] = unescapeHTML(m.group(1))
# Fall back to the id when the page carries no <h1> title.
2960 info['title'] = info['id']
2962 m = re.search('<description>([^<]+)</description>', coursepage)
2964 info['description'] = unescapeHTML(m.group(1))
# orderedSet preserves first-seen order while de-duplicating links.
2966 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
2969 'type': 'reference',
2970 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
2974 for entry in info['list']:
2975 assert entry['type'] == 'reference'
2976 results += self.extract(entry['url'])
# Branch 3: the site root — collect all CoursePage links and recurse
# into self.extract for each course.
2980 'id': 'Stanford OpenClassroom',
2983 'upload_date': None,
2986 self.report_download_webpage(info['id'])
2987 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
2989 rootpage = compat_urllib_request.urlopen(rootURL).read()
2990 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2991 self._downloader.trouble(u'ERROR: unable to download course info page: ' + compat_str(err))
2994 info['title'] = info['id']
2996 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
2999 'type': 'reference',
3000 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
3005 for entry in info['list']:
3006 assert entry['type'] == 'reference'
3007 results += self.extract(entry['url'])
3010 class MTVIE(InfoExtractor):
3011 """Information extractor for MTV.com"""
# Scrapes <meta> tags (mtv_vt = song, mtv_an = performer, mtvn_uri) and the
# player's default playlist id, then fetches a mediaGen XML whose last
# <rendition> (highest quality) supplies the stream URL and format string.
# NOTE(review): numbering gaps — `if mobj is None:` guards, `try:` openers
# and the final return-dict delimiters are missing from this excerpt.
3013 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
3016 def report_extraction(self, video_id):
3017 """Report information extraction."""
3018 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3020 def _real_extract(self, url):
3021 mobj = re.match(self._VALID_URL, url)
3023 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
# The scheme is optional in _VALID_URL; normalize to http:// before fetching.
3025 if not mobj.group('proto'):
3026 url = 'http://' + url
3027 video_id = mobj.group('videoid')
3029 webpage = self._download_webpage(url, video_id)
3031 mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
3033 self._downloader.trouble(u'ERROR: unable to extract song name')
3035 song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3036 mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
3038 self._downloader.trouble(u'ERROR: unable to extract performer')
3040 performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
3041 video_title = performer + ' - ' + song_name
3043 mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
3045 self._downloader.trouble(u'ERROR: unable to mtvn_uri')
3047 mtvn_uri = mobj.group(1)
3049 mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
3051 self._downloader.trouble(u'ERROR: unable to extract content id')
3053 content_id = mobj.group(1)
# mediaGen endpoint resolves the playlist/content ids to rendition XML.
3055 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
3056 self.report_extraction(video_id)
3057 request = compat_urllib_request.Request(videogen_url)
3059 metadataXml = compat_urllib_request.urlopen(request).read()
3060 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3061 self._downloader.trouble(u'ERROR: unable to download video metadata: %s' % compat_str(err))
3064 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
3065 renditions = mdoc.findall('.//rendition')
3067 # For now, always pick the highest quality.
3068 rendition = renditions[-1]
# Derive extension from the MIME type (e.g. "video/mp4" -> "mp4") and build
# a "<ext>-<w>x<h>_<bitrate>" format label.
3071 _,_,ext = rendition.attrib['type'].partition('/')
3072 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
3073 video_url = rendition.find('./src').text
3075 self._downloader.trouble('Invalid rendition field.')
3081 'uploader': performer,
3082 'upload_date': None,
3083 'title': video_title,
3091 class YoukuIE(InfoExtractor):
# Information extractor for v.youku.com. Fetches a JSON playlist config,
# de-scrambles the segment file id with a seeded keyed shuffle, then emits
# one info dict per video segment.
# NOTE(review): numbering gaps — `_gen_sid`'s `def` line, guards, `try:`
# openers and several dict/format branches are missing from this excerpt.
3092 _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
3094 def report_download_webpage(self, file_id):
3095 """Report webpage download."""
3096 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, file_id))
3098 def report_extraction(self, file_id):
3099 """Report information extraction."""
3100 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
# Session id: millisecond timestamp concatenated with two random numbers.
3103 nowTime = int(time.time() * 1000)
3104 random1 = random.randint(1000,1998)
3105 random2 = random.randint(1000,9999)
3107 return "%d%d%d" %(nowTime,random1,random2)
3109 def _get_file_ID_mix_string(self, seed):
# Deterministic shuffle of the alphabet driven by `seed` via a linear
# congruential step (seed*211+30031 mod 65536); consumed characters are
# removed so each appears once in the mixed output.
3111 source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
3113 for i in range(len(source)):
3114 seed = (seed * 211 + 30031 ) % 65536
3115 index = math.floor(seed / 65536 * len(source) )
3116 mixed.append(source[int(index)])
3117 source.remove(source[int(index)])
3118 #return ''.join(mixed)
3121 def _get_file_id(self, fileId, seed):
# Map each '*'-separated numeric token of fileId through the mixed
# alphabet to recover the real file id.
3122 mixed = self._get_file_ID_mix_string(seed)
3123 ids = fileId.split('*')
3127 realId.append(mixed[int(ch)])
3128 return ''.join(realId)
3130 def _real_extract(self, url):
3131 mobj = re.match(self._VALID_URL, url)
3133 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3135 video_id = mobj.group('ID')
3137 info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
3139 request = compat_urllib_request.Request(info_url, None, std_headers)
3141 self.report_download_webpage(video_id)
3142 jsondata = compat_urllib_request.urlopen(request).read()
3143 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3144 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
3147 self.report_extraction(video_id)
3149 jsonstr = jsondata.decode('utf-8')
3150 config = json.loads(jsonstr)
3152 video_title = config['data'][0]['title']
3153 seed = config['data'][0]['seed']
# Pick a stream format: honor --format, else prefer 'hd2' when available.
3155 format = self._downloader.params.get('format', None)
3156 supported_format = list(config['data'][0]['streamfileids'].keys())
3158 if format is None or format == 'best':
3159 if 'hd2' in supported_format:
3164 elif format == 'worst':
3172 fileid = config['data'][0]['streamfileids'][format]
3173 keys = [s['k'] for s in config['data'][0]['segs'][format]]
3174 except (UnicodeDecodeError, ValueError, KeyError):
3175 self._downloader.trouble(u'ERROR: unable to extract info section')
3179 sid = self._gen_sid()
3180 fileid = self._get_file_id(fileid, seed)
3182 #column 8,9 of fileid represent the segment number
3183 #fileid[7:9] should be changed
3184 for index, key in enumerate(keys):
# Splice the hex segment index into positions 8-9 of the file id and build
# the per-segment FLV download URL keyed by this segment's token.
3186 temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
3187 download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
3190 'id': '%s_part%02d' % (video_id, index),
3191 'url': download_url,
3193 'upload_date': None,
3194 'title': video_title,
3197 files_info.append(info)
3202 class XNXXIE(InfoExtractor):
3203 """Information extractor for xnxx.com"""
# Single-page scrape: three class-level regexes pull the FLV URL, title and
# thumbnail out of the watch page.
# NOTE(review): numbering gaps — `if ... is None:` guards, `try:` opener
# and return-dict delimiters are missing from this excerpt.
3205 _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
3207 VIDEO_URL_RE = r'flv_url=(.*?)&'
3208 VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
3209 VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'
3211 def report_webpage(self, video_id):
3212 """Report information extraction"""
3213 self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
3215 def report_extraction(self, video_id):
3216 """Report information extraction"""
3217 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
3219 def _real_extract(self, url):
3220 mobj = re.match(self._VALID_URL, url)
3222 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3224 video_id = mobj.group(1)
3226 self.report_webpage(video_id)
3228 # Get webpage content
3230 webpage_bytes = compat_urllib_request.urlopen(url).read()
3231 webpage = webpage_bytes.decode('utf-8')
3232 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3233 self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % err)
3236 result = re.search(self.VIDEO_URL_RE, webpage)
3238 self._downloader.trouble(u'ERROR: unable to extract video url')
# flv_url is percent-encoded in the page source; unquote it.
3240 video_url = compat_urllib_parse.unquote(result.group(1))
3242 result = re.search(self.VIDEO_TITLE_RE, webpage)
3244 self._downloader.trouble(u'ERROR: unable to extract video title')
3246 video_title = result.group(1)
3248 result = re.search(self.VIDEO_THUMB_RE, webpage)
3250 self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
3252 video_thumbnail = result.group(1)
3258 'upload_date': None,
3259 'title': video_title,
3261 'thumbnail': video_thumbnail,
3262 'description': None,
3266 class GooglePlusIE(InfoExtractor):
3267 """Information extractor for plus.google.com."""
# Two-step scrape: (1) the post page yields date, author and title;
# (2) the linked photos page yields googlevideo redirector URLs of all
# sizes, of which the highest resolution is chosen.
# NOTE(review): numbering gaps — `if mobj is None:` guards, `try:` openers
# and return-dict delimiters are missing from this excerpt.
3269 _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
3270 IE_NAME = u'plus.google'
3272 def __init__(self, downloader=None):
3273 InfoExtractor.__init__(self, downloader)
3275 def report_extract_entry(self, url):
3276 """Report downloading entry"""
3277 self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)
3279 def report_date(self, upload_date):
3280 """Report entry date"""
3281 self._downloader.to_screen(u'[plus.google] Entry date: %s' % upload_date)
3283 def report_uploader(self, uploader):
3284 """Report entry uploader"""
3285 self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)
3287 def report_title(self, video_title):
3288 """Report entry title"""
3289 self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)
3291 def report_extract_vid_page(self, video_page):
3292 """Report information extraction."""
3293 self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)
3295 def _real_extract(self, url):
3296 # Extract id from URL
3297 mobj = re.match(self._VALID_URL, url)
3299 self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
3302 post_url = mobj.group(0)
3303 video_id = mobj.group(1)
3305 video_extension = 'flv'
3307 # Step 1, Retrieve post webpage to extract further information
3308 self.report_extract_entry(post_url)
3309 request = compat_urllib_request.Request(post_url)
3311 webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3312 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3313 self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))
3316 # Extract update date
3318 pattern = 'title="Timestamp">(.*?)</a>'
3319 mobj = re.search(pattern, webpage)
3321 upload_date = mobj.group(1)
3322 # Convert timestring to a format suitable for filename
3323 upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
3324 upload_date = upload_date.strftime('%Y%m%d')
3325 self.report_date(upload_date)
3329 pattern = r'rel\="author".*?>(.*?)</a>'
3330 mobj = re.search(pattern, webpage)
3332 uploader = mobj.group(1)
3333 self.report_uploader(uploader)
3336 # Get the first line for title
3338 pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
3339 mobj = re.search(pattern, webpage)
3341 video_title = mobj.group(1)
3342 self.report_title(video_title)
3344 # Step 2, Stimulate clicking the image box to launch video
3345 pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
3346 mobj = re.search(pattern, webpage)
3348 self._downloader.trouble(u'ERROR: unable to extract video page URL')
3350 video_page = mobj.group(1)
3351 request = compat_urllib_request.Request(video_page)
3353 webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
3354 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3355 self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
3357 self.report_extract_vid_page(video_page)
3360 # Extract video links on video page
3361 """Extract video links of all sizes"""
3362 pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
3363 mobj = re.findall(pattern, webpage)
3365 self._downloader.trouble(u'ERROR: unable to extract video links')
3367 # Sort in resolution
3368 links = sorted(mobj)
3370 # Choose the lowest of the sort, i.e. highest resolution
3371 video_url = links[-1]
3372 # Only get the url. The resolution part in the tuple has no use anymore
3373 video_url = video_url[-1]
3374 # Treat escaped \u0026 style hex
# Py2 str has .decode; on Py3 that raises AttributeError, so round-trip
# through bytes to apply the unicode-escape codec.
3376 video_url = video_url.decode("unicode_escape")
3377 except AttributeError: # Python 3
3378 video_url = bytes(video_url, 'ascii').decode('unicode-escape')
3384 'uploader': uploader,
3385 'upload_date': upload_date,
3386 'title': video_title,
3387 'ext': video_extension,
3390 class NBAIE(InfoExtractor):
# Builds the CDN MP4 URL directly from the URL path and scrapes title/date/
# description from the watch page via the local _findProp helper.
# NOTE(review): numbering gaps — guards and return-dict delimiters are
# missing from this excerpt.
3391 _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
3394 def _real_extract(self, url):
3395 mobj = re.match(self._VALID_URL, url)
3397 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3400 video_id = mobj.group(1)
3401 if video_id.endswith('/index.html'):
3402 video_id = video_id[:-len('/index.html')]
3404 webpage = self._download_webpage(url, video_id)
3406 video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
# Closure over `webpage`: first regex group, unescaped, or `default`.
3407 def _findProp(rexp, default=None):
3408 m = re.search(rexp, webpage)
3410 return unescapeHTML(m.group(1))
3414 shortened_video_id = video_id.rpartition('/')[2]
3415 title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
3417 'id': shortened_video_id,
# NOTE(review): 'uploader_date' looks like a typo for the standard
# 'upload_date' key used elsewhere in this file — confirm before relying
# on this field.
3421 'uploader_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
3422 'description': _findProp(r'<div class="description">(.*?)</h1>'),
3426 class JustinTVIE(InfoExtractor):
3427 """Information extractor for justin.tv and twitch.tv"""
3428 # TODO: One broadcast may be split into multiple videos. The key
3429 # 'broadcast_id' is the same for all parts, and 'broadcast_part'
3430 # starts at 1 and increases. Can we treat all parts as one video?
# Uses the JSON API at api.justin.tv: one endpoint for a whole channel's
# archives (paged _JUSTIN_PAGE_LIMIT at a time) and one for a single
# broadcast looked up by archive id.
# NOTE(review): numbering gaps — guards, `try:` openers and loop setup
# lines are missing from this excerpt.
3432 _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
3433 ([^/]+)(?:/b/([^/]+))?/?(?:\#.*)?$"""
3434 _JUSTIN_PAGE_LIMIT = 100
3435 IE_NAME = u'justin.tv'
3437 def report_extraction(self, file_id):
3438 """Report information extraction."""
3439 self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
3441 def report_download_page(self, channel, offset):
3442 """Report attempt to download a single page of videos."""
3443 self._downloader.to_screen(u'[%s] %s: Downloading video information from %d to %d' %
3444 (self.IE_NAME, channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
3446 # Return count of items, list of *valid* items
3447 def _parse_page(self, url):
3449 urlh = compat_urllib_request.urlopen(url)
3450 webpage_bytes = urlh.read()
3451 webpage = webpage_bytes.decode('utf-8', 'ignore')
3452 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
3453 self._downloader.trouble(u'ERROR: unable to download video info JSON: %s' % compat_str(err))
# The API returns a JSON list of clips on success, or an object carrying
# an 'error' key on failure.
3456 response = json.loads(webpage)
3457 if type(response) != list:
3458 error_text = response.get('error', 'unknown error')
3459 self._downloader.trouble(u'ERROR: Justin.tv API: %s' % error_text)
3462 for clip in response:
3463 video_url = clip['video_file_url']
3465 video_extension = os.path.splitext(video_url)[1][1:]
# start_time is ISO-style; strip the dashes from the date part -> YYYYMMDD.
3466 video_date = re.sub('-', '', clip['start_time'][:10])
3467 video_uploader_id = clip.get('user_id', clip.get('channel_id'))
3468 video_id = clip['id']
3469 video_title = clip.get('title', video_id)
3473 'title': video_title,
3474 'uploader': clip.get('channel_name', video_uploader_id),
3475 'uploader_id': video_uploader_id,
3476 'upload_date': video_date,
3477 'ext': video_extension,
3479 return (len(response), info)
3481 def _real_extract(self, url):
3482 mobj = re.match(self._VALID_URL, url)
3484 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3487 api = 'http://api.justin.tv'
# lastindex == 1 means only the channel group matched (no /b/<id> part).
3488 video_id = mobj.group(mobj.lastindex)
3490 if mobj.lastindex == 1:
3492 api += '/channel/archives/%s.json'
3494 api += '/broadcast/by_archive/%s.json'
3495 api = api % (video_id,)
3497 self.report_extraction(video_id)
3501 limit = self._JUSTIN_PAGE_LIMIT
3504 self.report_download_page(video_id, offset)
3505 page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
3506 page_count, page_info = self._parse_page(page_url)
3507 info.extend(page_info)
# A short page means we've reached the end of a paged channel listing.
3508 if not paged or page_count != limit:
3513 class FunnyOrDieIE(InfoExtractor):
# Scrapes the <video>/<source> tags for the stream URL and the page markup
# for title and og:description.
# NOTE(review): numbering gaps — guards and return-dict delimiters are
# missing from this excerpt.
3514 _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
3516 def _real_extract(self, url):
3517 mobj = re.match(self._VALID_URL, url)
3519 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3522 video_id = mobj.group('id')
3523 webpage = self._download_webpage(url, video_id)
# Second <source> inside the <video> element carries the usable URL.
3525 m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
3527 self._downloader.trouble(u'ERROR: unable to find video information')
3528 video_url = unescapeHTML(m.group('url'))
3530 m = re.search(r"class='player_page_h1'>\s+<a.*?>(?P<title>.*?)</a>", webpage)
3532 self._downloader.trouble(u'Cannot find video title')
3533 title = unescapeHTML(m.group('title'))
3535 m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
3537 desc = unescapeHTML(m.group('desc'))
3546 'description': desc,
3550 class TweetReelIE(InfoExtractor):
# Scrapes the status id from the page, then derives the .mov URL directly
# from it; description, uploader and unix timestamp come from page markup.
# NOTE(review): numbering gaps — guards and return-dict delimiters are
# missing from this excerpt.
3551 _VALID_URL = r'^(?:https?://)?(?:www\.)?tweetreel\.com/[?](?P<id>[0-9a-z]+)$'
3553 def _real_extract(self, url):
3554 mobj = re.match(self._VALID_URL, url)
3556 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3559 video_id = mobj.group('id')
3560 webpage = self._download_webpage(url, video_id)
3562 m = re.search(r'<div id="left" status_id="([0-9]+)">', webpage)
3564 self._downloader.trouble(u'ERROR: Cannot find status ID')
3565 status_id = m.group(1)
3567 m = re.search(r'<div class="tweet_text">(.*?)</div>', webpage, flags=re.DOTALL)
3569 self._downloader.trouble(u'WARNING: Cannot find description')
# Strip embedded <a> tags from the tweet text before unescaping.
3570 desc = unescapeHTML(re.sub('<a.*?</a>', '', m.group(1))).strip()
3572 m = re.search(r'<div class="tweet_info">.*?from <a target="_blank" href="https?://twitter.com/(?P<uploader_id>.+?)">(?P<uploader>.+?)</a>', webpage, flags=re.DOTALL)
3574 self._downloader.trouble(u'ERROR: Cannot find uploader')
3575 uploader = unescapeHTML(m.group('uploader'))
3576 uploader_id = unescapeHTML(m.group('uploader_id'))
3578 m = re.search(r'<span unixtime="([0-9]+)"', webpage)
3580 self._downloader.trouble(u'ERROR: Cannot find upload date')
3581 upload_date = datetime.datetime.fromtimestamp(int(m.group(1))).strftime('%Y%m%d')
3584 video_url = 'http://files.tweetreel.com/video/' + status_id + '.mov'
3591 'description': desc,
3592 'uploader': uploader,
3593 'uploader_id': uploader_id,
3594 'internal_id': status_id,
3595 'upload_date': upload_date
3599 class SteamIE(InfoExtractor):
# Extracts every trailer on a Steam store video page: movie URLs are read
# from the page's JS movie table, titles from <span class="title"> tags,
# paired positionally with zip().
# NOTE(review): numbering gaps — parts of _VALID_URL and the per-video
# dict construction are missing from this excerpt.
3600 _VALID_URL = r"""http://store.steampowered.com/
3601 (?P<urltype>video|app)/ #If the page is only for videos or for a game
3603 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
3606 def suitable(self, url):
3607 """Receives a URL and returns True if suitable for this IE."""
# Overridden because _VALID_URL is written with (?x)-style comments and
# must be matched with re.VERBOSE.
3608 return re.match(self._VALID_URL, url, re.VERBOSE) is not None
3610 def _real_extract(self, url):
3611 m = re.match(self._VALID_URL, url, re.VERBOSE)
3612 urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
3613 gameID = m.group('gameID')
3614 videourl = 'http://store.steampowered.com/video/%s/' % gameID
3615 webpage = self._download_webpage(videourl, gameID)
3616 mweb = re.finditer(urlRE, webpage)
3617 namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
3618 titles = re.finditer(namesRE, webpage)
3620 for vid,vtitle in zip(mweb,titles):
3621 video_id = vid.group('videoID')
3622 title = vtitle.group('videoName')
3623 video_url = vid.group('videoURL')
3625 self._downloader.trouble(u'ERROR: Cannot find video url for %s' % video_id)
3630 'title': unescapeHTML(title)
3635 class UstreamIE(InfoExtractor):
# Recorded-video extractor: the stream URL is derived directly from the
# numeric video id; title and uploader id are scraped from data-attributes.
# NOTE(review): numbering gaps — the return-dict delimiters are missing
# from this excerpt.
3636 _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
3637 IE_NAME = u'ustream'
3639 def _real_extract(self, url):
3640 m = re.match(self._VALID_URL, url)
3641 video_id = m.group('videoID')
3642 video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
3643 webpage = self._download_webpage(url, video_id)
3644 m = re.search(r'data-title="(?P<title>.+)"',webpage)
3645 title = m.group('title')
# 'uploader' here is the numeric channel content id from the state link.
3646 m = re.search(r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"',webpage)
3647 uploader = m.group('uploader')
3653 'uploader': uploader
3657 class RBMARadioIE(InfoExtractor):
# Reads the show's metadata from an inline `gon.show=...` JSON blob; the
# stream is the Akamai URL with a fixed 256 kbps cbr parameter appended.
# NOTE(review): numbering gaps — guards, `try:` opener and return-dict
# delimiters are missing from this excerpt.
3658 _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'
3660 def _real_extract(self, url):
3661 m = re.match(self._VALID_URL, url)
3662 video_id = m.group('videoID')
3664 webpage = self._download_webpage(url, video_id)
3665 m = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
3667 raise ExtractorError(u'Cannot find metadata')
3668 json_data = m.group(1)
3671 data = json.loads(json_data)
3672 except ValueError as e:
3673 raise ExtractorError(u'Invalid JSON: ' + str(e))
3675 video_url = data['akamai_url'] + '&cbr=256'
# Extension is taken from the URL path, ignoring any query string.
3676 url_parts = compat_urllib_parse_urlparse(video_url)
3677 video_ext = url_parts.path.rpartition('.')[2]
3682 'title': data['title'],
3683 'description': data.get('teaser_text'),
3684 'location': data.get('country_of_origin'),
3685 'uploader': data.get('host', {}).get('name'),
3686 'uploader_id': data.get('host', {}).get('slug'),
3687 'thumbnail': data.get('image', {}).get('large_url_2x'),
3688 'duration': data.get('duration'),
3693 class YouPornIE(InfoExtractor):
3694 """Information extractor for youporn.com."""
# Bypasses the age gate with an age_verified cookie, scrapes metadata and
# the full download list, builds one format entry per link, then returns
# best/worst/all or a specific requested format.
# NOTE(review): numbering gaps — guards, `else:` branches and several
# dict/return lines are missing from this excerpt.
3695 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
3697 def _print_formats(self, formats):
3698 """Print all available formats"""
3699 print(u'Available formats:')
3700 print(u'ext\t\tformat')
3701 print(u'---------------------------------')
3702 for format in formats:
3703 print(u'%s\t\t%s' % (format['ext'], format['format']))
# Select the single format entry whose 'format' label equals req_format.
3705 def _specific(self, req_format, formats):
3707 if(x["format"]==req_format):
3711 def _real_extract(self, url):
3712 mobj = re.match(self._VALID_URL, url)
3714 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3717 video_id = mobj.group('videoid')
# age_verified cookie skips the age confirmation interstitial.
3719 req = compat_urllib_request.Request(url)
3720 req.add_header('Cookie', 'age_verified=1')
3721 webpage = self._download_webpage(req, video_id)
3723 # Get the video title
3724 result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
3726 raise ExtractorError(u'Unable to extract video title')
3727 video_title = result.group('title').strip()
3729 # Get the video date
3730 result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
3732 self._downloader.to_stderr(u'WARNING: unable to extract video date')
3735 upload_date = result.group('date').strip()
3737 # Get the video uploader
3738 result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
3740 self._downloader.to_stderr(u'WARNING: unable to extract uploader')
3741 video_uploader = None
3743 video_uploader = result.group('uploader').strip()
3744 video_uploader = clean_html( video_uploader )
3746 # Get all of the formats available
3747 DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
3748 result = re.search(DOWNLOAD_LIST_RE, webpage)
3750 raise ExtractorError(u'Unable to extract download list')
3751 download_list_html = result.group('download_list').strip()
3753 # Get all of the links from the page
3754 LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
3755 links = re.findall(LINK_RE, download_list_html)
3756 if(len(links) == 0):
3757 raise ExtractorError(u'ERROR: no known formats available for video')
3759 self._downloader.to_screen(u'[youporn] Links found: %d' % len(links))
3764 # A link looks like this:
3765 # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
3766 # A path looks like this:
3767 # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
3768 video_url = unescapeHTML( link )
3769 path = compat_urllib_parse_urlparse( video_url ).path
3770 extension = os.path.splitext( path )[1][1:]
# The 5th path segment encodes "<size>_<bitrate>_<id>"; keep size+bitrate.
3771 format = path.split('/')[4].split('_')[:2]
3774 format = "-".join( format )
3775 title = u'%s-%s-%s' % (video_title, size, bitrate)
3780 'uploader': video_uploader,
3781 'upload_date': upload_date,
3786 'description': None,
3790 if self._downloader.params.get('listformats', None):
3791 self._print_formats(formats)
3794 req_format = self._downloader.params.get('format', None)
3795 self._downloader.to_screen(u'[youporn] Format: %s' % req_format)
# Formats are assumed best-first here: [0] is best, [-1] is worst.
3797 if req_format is None or req_format == 'best':
3799 elif req_format == 'worst':
3800 return [formats[-1]]
3801 elif req_format in ('-1', 'all'):
3804 format = self._specific( req_format, formats )
3806 self._downloader.trouble(u'ERROR: requested format not available')
3812 class PornotubeIE(InfoExtractor):
3813 """Information extractor for pornotube.com."""
# Title comes from the URL itself; the FLV URL and "Added <date> by" line
# are scraped from the watch page.
# NOTE(review): numbering gaps — guards and the tail of the info dict are
# missing from this excerpt.
3814 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
3816 def _real_extract(self, url):
3817 mobj = re.match(self._VALID_URL, url)
3819 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3822 video_id = mobj.group('videoid')
3823 video_title = mobj.group('title')
3825 # Get webpage content
3826 webpage = self._download_webpage(url, video_id)
3829 VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
3830 result = re.search(VIDEO_URL_RE, webpage)
3832 self._downloader.trouble(u'ERROR: unable to extract video url')
3834 video_url = compat_urllib_parse.unquote(result.group('url'))
3836 #Get the uploaded date
3837 VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
3838 result = re.search(VIDEO_UPLOADED_RE, webpage)
# NOTE(review): this failure is about the upload date, but the message
# says "video title" — looks copy-pasted; confirm and correct upstream.
3840 self._downloader.trouble(u'ERROR: unable to extract video title')
3842 upload_date = result.group('date')
3844 info = {'id': video_id,
3847 'upload_date': upload_date,
3848 'title': video_title,
3854 class YouJizzIE(InfoExtractor):
3855 """Information extractor for youjizz.com."""
# Two-page scrape: the watch page supplies the title and the embed-page
# URL (whose numeric id replaces the slug-based video_id); the embed page
# supplies the actual file URL from a flashvars addVariable call.
# NOTE(review): numbering gaps — guards and parts of the info dict are
# missing from this excerpt.
3856 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'
3858 def _real_extract(self, url):
3859 mobj = re.match(self._VALID_URL, url)
3861 self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
3864 video_id = mobj.group('videoid')
3866 # Get webpage content
3867 webpage = self._download_webpage(url, video_id)
3869 # Get the video title
3870 result = re.search(r'<title>(?P<title>.*)</title>', webpage)
3872 raise ExtractorError(u'ERROR: unable to extract video title')
3873 video_title = result.group('title').strip()
3875 # Get the embed page
3876 result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
3878 raise ExtractorError(u'ERROR: unable to extract embed page')
3880 embed_page_url = result.group(0).strip()
3881 video_id = result.group('videoid')
3883 webpage = self._download_webpage(embed_page_url, video_id)
3886 result = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
3888 raise ExtractorError(u'ERROR: unable to extract video url')
3889 video_url = result.group('source')
3891 info = {'id': video_id,
3893 'title': video_title,
3896 'player_url': embed_page_url}
3900 class EightTracksIE(InfoExtractor):
# Playlist extractor for 8tracks mixes: parses the inline PAGE.mix JSON,
# opens a random play session, then walks the play/next API until
# 'at_last_track', collecting one entry per track.
# NOTE(review): numbering gaps — e.g. `mix_id` is used at 3921/3941 but
# its assignment is not visible in this excerpt; guards and dict openers
# are also missing.
3902 _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'
3904 def _real_extract(self, url):
3905 mobj = re.match(self._VALID_URL, url)
3907 raise ExtractorError(u'Invalid URL: %s' % url)
3908 playlist_id = mobj.group('id')
3910 webpage = self._download_webpage(url, playlist_id)
3912 m = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
3914 raise ExtractorError(u'Cannot find trax information')
3915 json_like = m.group(1)
3916 data = json.loads(json_like)
# Random client-side session id required by the play API.
3918 session = str(random.randint(0, 1000000000))
3920 track_count = data['tracks_count']
3921 first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
3922 next_url = first_url
3924 for i in itertools.count():
3925 api_json = self._download_webpage(next_url, playlist_id,
3926 note=u'Downloading song information %s/%s' % (str(i+1), track_count),
3927 errnote=u'Failed to download song information')
3928 api_data = json.loads(api_json)
3929 track_data = api_data[u'set']['track']
3931 'id': track_data['id'],
3932 'url': track_data['track_file_stream_url'],
3933 'title': track_data['performer'] + u' - ' + track_data['name'],
3934 'raw_title': track_data['name'],
3935 'uploader_id': data['user']['login'],
3939 if api_data['set']['at_last_track']:
3941 next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
3944 class KeekIE(InfoExtractor):
# Video and thumbnail URLs are derived directly from the id on keek's CDN;
# title and uploader are scraped from the page.
# NOTE(review): numbering gaps — the start of the return dict is missing
# from this excerpt.
3945 _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
3948 def _real_extract(self, url):
3949 m = re.match(self._VALID_URL, url)
3950 video_id = m.group('videoID')
3951 video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
3952 thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
3953 webpage = self._download_webpage(url, video_id)
3954 m = re.search(r'<meta property="og:title" content="(?P<title>.+)"', webpage)
3955 title = unescapeHTML(m.group('title'))
3956 m = re.search(r'<div class="bio-names-and-report">[\s\n]+<h4>(?P<uploader>\w+)</h4>', webpage)
3957 uploader = unescapeHTML(m.group('uploader'))
3963 'thumbnail': thumbnail,
3964 'uploader': uploader
3968 class TEDIE(InfoExtractor):
3969 _VALID_URL=r'''http://www.ted.com/
3971 ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
3973 ((?P<type_talk>talks)) # We have a simple talk
3975 /(?P<name>\w+) # Here goes the name and then ".html"
3978 def suitable(self, url):
3979 """Receives a URL and returns True if suitable for this IE."""
3980 return re.match(self._VALID_URL, url, re.VERBOSE) is not None
3982 def _real_extract(self, url):
3983 m=re.match(self._VALID_URL, url, re.VERBOSE)
3984 if m.group('type_talk'):
3985 return [self._talk_info(url)]
3987 playlist_id=m.group('playlist_id')
3988 name=m.group('name')
3989 self._downloader.to_screen(u'[%s] Getting info of playlist %s: "%s"' % (self.IE_NAME,playlist_id,name))
3990 return self._playlist_videos_info(url,name,playlist_id)
3992 def _talk_video_link(self,mediaSlug):
3993 '''Returns the video link for that mediaSlug'''
3994 return 'http://download.ted.com/talks/%s.mp4' % mediaSlug
    def _playlist_videos_info(self,url,name,playlist_id=0):
        '''Returns the videos of the playlist'''
        # Verbose-mode pattern: captures each talk's numeric id and its media
        # slug from the playlist page markup.
        <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
        ([.\s]*?)data-playlist_item_id="(\d+)"
        ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
        # Companion pattern: the human-readable title of each talk.
        video_name_RE=r'<p\ class="talk-title"><a href="/talks/(.+).html">(?P<fullname>.+?)</a></p>'
        webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
        m_names=re.finditer(video_name_RE,webpage)
        # Pair each id/slug match with its title match, in page order;
        # presumably both patterns match once per talk — TODO confirm.
        for m_video, m_name in zip(m_videos,m_names):
            'id': m_video.group('video_id'),
            'url': self._talk_video_link(m_video.group('mediaSlug')),
            'title': m_name.group('fullname')
        info.append(video_dic)
    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        m=re.match(self._VALID_URL, url,re.VERBOSE)
        videoName=m.group('name')
        webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
        # If the url includes the language we get the title translated
        title_RE=r'<h1><span id="altHeadline" >(?P<title>[\s\w:/\.\?=\+-\\\']*)</span></h1>'
        title=re.search(title_RE, webpage).group('title')
        # The talk's numeric id and media slug live in an inline
        # "talkDetails" JavaScript blob on the page.
        info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
                    "id":(?P<videoID>[\d]+).*?
                    "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"'''
        info_match=re.search(info_RE,webpage,re.VERBOSE)
        video_id=info_match.group('videoID')
        mediaSlug=info_match.group('mediaSlug')
        # Convert the slug into a direct download URL.
        video_url=self._talk_video_link(mediaSlug)
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de videos."""
    _VALID_URL = r'http://www.myspass.de/.*'
    def _real_extract(self, url):
        # XML metadata endpoint; %s is filled with the numeric video id.
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        _, video_id = os.path.split(url_parent_path)

        # Fetch the metadata document and parse it as XML; the downloaded
        # text is re-encoded because ElementTree wants bytes here.
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            # Missing download URL is fatal: report and bail out.
            self._downloader.trouble(u'ERROR: unable to extract download url')
        video_url = url_flv_el.text
        # File extension is derived from the FLV URL's path suffix.
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
        title = title_el.text
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
        format = format_id_el.text
        # description and thumbnail are optional fields.
        description_el = metadata.find('description')
        if description_el is not None:
            description = description_el.text
        imagePreview_el = metadata.find('imagePreview')
        if imagePreview_el is not None:
            thumbnail = imagePreview_el.text
        'thumbnail': thumbnail,
        'description': description
4096 def gen_extractors():
4097 """ Return a list of an instance of every supported extractor.
4098 The order does matter; the first extractor matched is the one handling the URL.
4101 YoutubePlaylistIE(),
4125 StanfordOpenClassroomIE(),