from __future__ import absolute_import
+import base64
import datetime
import netrc
import os
Information extractors are the classes that, given a URL, extract
information about the video (or videos) the URL refers to. This
information includes the real video URL, the video title, author and
- others. The information is stored in a dictionary which is then
+ others. The information is stored in a dictionary which is then
passed to the FileDownloader. The FileDownloader processes this
information possibly downloading the video to the file system, among
other possible outcomes.
id: Video identifier.
url: Final video URL.
- uploader: Nickname of the video uploader, unescaped.
- upload_date: Video upload date (YYYYMMDD).
title: Video title, unescaped.
ext: Video filename extension.
+ uploader: Full name of the video uploader.
+ upload_date: Video upload date (YYYYMMDD).
The following fields are optional:
format: The video format, defaults to ext (used for --get-format)
thumbnail: Full URL to a video thumbnail image.
description: One-line video description.
+ uploader_id: Nickname or id of the video uploader.
player_url: SWF Player URL (used for rtmpdump).
subtitles: The .srt file contents.
urlhandle: [internal] The urlHandle to be used to download the file,
"""Real extraction process. Redefine in subclasses."""
pass
+ @property
+ def IE_NAME(self):
+ """Default extractor name: the class name with its last two characters removed.
+
+ Presumably this strips the conventional "IE" suffix (e.g. YoutubeIE -> Youtube);
+ subclasses may still override it with a plain class attribute.
+ """
+ return type(self).__name__[:-2]
+
+ def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
+ """Download a page and return its contents as text.
+
+ url_or_request is handed unchanged to compat_urllib_request.urlopen.
+ video_id is only used in the progress message.
+ note overrides the default progress message (u'Downloading video webpage').
+ errnote overrides the default error prefix (u'Unable to download webpage').
+
+ The response body is always decoded as UTF-8, with undecodable bytes replaced.
+ Raises ExtractorError (given the current traceback) on a URL, HTTP or socket error.
+ """
+ if note is None:
+ note = u'Downloading video webpage'
+ self._downloader.to_screen(u'[%s] %s: %s' % (self.IE_NAME, video_id, note))
+ try:
+ urlh = compat_urllib_request.urlopen(url_or_request)
+ webpage_bytes = urlh.read()
+ return webpage_bytes.decode('utf-8', 'replace')
+ except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
+ if errnote is None:
+ errnote = u'Unable to download webpage'
+ raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])
+
class YoutubeIE(InfoExtractor):
"""Information extractor for youtube.com."""
|(?: # or the v= param in all its forms
(?:watch(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
(?:\?|\#!?) # the params delimiter ? or # or #!
- (?:.+&)? # any other preceding param (like /?s=tuff&v=xxxx)
+ (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
v=
)
)? # optional -> youtube.com/xxxx is OK
'44': '480x854',
'45': '720x1280',
'46': '1080x1920',
- }
+ }
IE_NAME = u'youtube'
def suitable(self, url):
srt += caption + '\n\n'
return srt
+ def _extract_subtitles(self, video_id):
+ """Download the closed captions for video_id and convert them to SRT.
+
+ Returns (error_message, None) on failure and (None, srt_contents) on success;
+ nothing is reported to the downloader here, the caller decides what to do
+ with the message.
+
+ Language choice: the 'subtitleslang' downloader param if set, otherwise 'en'
+ when available, otherwise the first key of the language dict.
+ """
+ self.report_video_subtitles_download(video_id)
+ request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
+ try:
+ srt_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
+ except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
+ return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
+ srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
+ # Map lang_code -> track name; the name is needed to request the track below
+ srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
+ if not srt_lang_list:
+ return (u'WARNING: video has no closed captions', None)
+ if self._downloader.params.get('subtitleslang', False):
+ srt_lang = self._downloader.params.get('subtitleslang')
+ elif 'en' in srt_lang_list:
+ srt_lang = 'en'
+ else:
+ srt_lang = list(srt_lang_list.keys())[0]
+ if srt_lang not in srt_lang_list:
+ return (u'WARNING: no closed captions found in the specified language', None)
+ # NOTE(review): srt_lang and the track name are interpolated without URL-quoting - confirm this is safe for names containing spaces or '&'
+ request = compat_urllib_request.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id))
+ try:
+ srt_xml = compat_urllib_request.urlopen(request).read().decode('utf-8')
+ except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
+ return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
+ if not srt_xml:
+ return (u'WARNING: unable to download video subtitles', None)
+ return (None, self._closed_captions_xml_to_srt(srt_xml))
+
def _print_formats(self, formats):
print('Available formats:')
for x in formats:
request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
try:
self.report_login()
- login_results = compat_urllib_request.urlopen(request).read()
+ login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
if re.search(r'(?i)<form[^>]* name="loginForm"', login_results) is not None:
self._downloader.to_stderr(u'WARNING: unable to log in: bad username or password')
return
request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
try:
self.report_age_confirmation()
- age_results = compat_urllib_request.urlopen(request).read()
+ age_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
return
- def _real_extract(self, url):
- # Extract original video URL from URL with redirection, like age verification, using next_url parameter
- mobj = re.search(self._NEXT_URL_RE, url)
- if mobj:
- url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
-
- # Extract video id from URL
+ def _extract_id(self, url):
+ """Return the video id captured by group 2 of _VALID_URL.
+
+ When url does not match, the error is reported through the downloader and
+ None is returned.
+ """
mobj = re.match(self._VALID_URL, url, re.VERBOSE)
if mobj is None:
self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
+ # NOTE(review): implicit None return; _real_extract uses the result without checking it
return
video_id = mobj.group(2)
+ return video_id
+
+ def _real_extract(self, url):
+ # Extract original video URL from URL with redirection, like age verification, using next_url parameter
+ mobj = re.search(self._NEXT_URL_RE, url)
+ if mobj:
+ url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
+ video_id = self._extract_id(url)
# Get video webpage
self.report_video_webpage_download(video_id)
- request = compat_urllib_request.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
+ url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
+ request = compat_urllib_request.Request(url)
try:
video_webpage_bytes = compat_urllib_request.urlopen(request).read()
except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
# uploader
if 'author' not in video_info:
- self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
+ self._downloader.trouble(u'ERROR: unable to extract uploader name')
return
video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
+ # uploader_id
+ video_uploader_id = None
+ mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
+ if mobj is not None:
+ video_uploader_id = mobj.group(1)
+ else:
+ self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
+
# title
if 'title' not in video_info:
self._downloader.trouble(u'ERROR: unable to extract video title')
# closed captions
video_subtitles = None
if self._downloader.params.get('writesubtitles', False):
- try:
- self.report_video_subtitles_download(video_id)
- request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
- try:
- srt_list = compat_urllib_request.urlopen(request).read()
- except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
- raise Trouble(u'WARNING: unable to download video subtitles: %s' % compat_str(err))
- srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
- srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
- if not srt_lang_list:
- raise Trouble(u'WARNING: video has no closed captions')
- if self._downloader.params.get('subtitleslang', False):
- srt_lang = self._downloader.params.get('subtitleslang')
- elif 'en' in srt_lang_list:
- srt_lang = 'en'
- else:
- srt_lang = srt_lang_list.keys()[0]
- if not srt_lang in srt_lang_list:
- raise Trouble(u'WARNING: no closed captions found in the specified language')
- request = compat_urllib_request.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id))
- try:
- srt_xml = compat_urllib_request.urlopen(request).read()
- except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
- raise Trouble(u'WARNING: unable to download video subtitles: %s' % compat_str(err))
- if not srt_xml:
- raise Trouble(u'WARNING: unable to download video subtitles')
- video_subtitles = self._closed_captions_xml_to_srt(srt_xml.decode('utf-8'))
- except Trouble as trouble:
- self._downloader.trouble(trouble[0])
+ (srt_error, video_subtitles) = self._extract_subtitles(video_id)
+ if srt_error:
+ self._downloader.trouble(srt_error)
if 'length_seconds' not in video_info:
self._downloader.trouble(u'WARNING: unable to extract video duration')
elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
url_data = [compat_parse_qs(uds) for uds in url_data_strs]
- url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
+ url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)
format_limit = self._downloader.params.get('format_limit', None)
'id': video_id,
'url': video_real_url,
'uploader': video_uploader,
+ 'uploader_id': video_uploader_id,
'upload_date': upload_date,
'title': video_title,
'ext': video_extension,
def __init__(self, downloader=None):
InfoExtractor.__init__(self, downloader)
- def report_download_webpage(self, video_id):
- """Report webpage download."""
- self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
-
def report_extraction(self, video_id):
"""Report information extraction."""
self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
# Retrieve video webpage to extract further information
request = compat_urllib_request.Request(url)
request.add_header('Cookie', 'family_filter=off')
- try:
- self.report_download_webpage(video_id)
- webpage_bytes = compat_urllib_request.urlopen(request).read()
- webpage = webpage_bytes.decode('utf-8')
- except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
- self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
- return
+ webpage = self._download_webpage(request, video_id)
# Extract URL, uploader and title from webpage
self.report_extraction(video_id)
}]
-class GoogleIE(InfoExtractor):
- """Information extractor for video.google.com."""
-
- _VALID_URL = r'(?:http://)?video\.google\.(?:com(?:\.au)?|co\.(?:uk|jp|kr|cr)|ca|de|es|fr|it|nl|pl)/videoplay\?docid=([^\&]+).*'
- IE_NAME = u'video.google'
-
- def __init__(self, downloader=None):
- InfoExtractor.__init__(self, downloader)
-
- def report_download_webpage(self, video_id):
- """Report webpage download."""
- self._downloader.to_screen(u'[video.google] %s: Downloading webpage' % video_id)
-
- def report_extraction(self, video_id):
- """Report information extraction."""
- self._downloader.to_screen(u'[video.google] %s: Extracting information' % video_id)
-
- def _real_extract(self, url):
- # Extract id from URL
- mobj = re.match(self._VALID_URL, url)
- if mobj is None:
- self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
- return
-
- video_id = mobj.group(1)
-
- video_extension = 'mp4'
-
- # Retrieve video webpage to extract further information
- request = compat_urllib_request.Request('http://video.google.com/videoplay?docid=%s&hl=en&oe=utf-8' % video_id)
- try:
- self.report_download_webpage(video_id)
- webpage = compat_urllib_request.urlopen(request).read()
- except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
- self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
- return
-
- # Extract URL, uploader, and title from webpage
- self.report_extraction(video_id)
- mobj = re.search(r"download_url:'([^']+)'", webpage)
- if mobj is None:
- video_extension = 'flv'
- mobj = re.search(r"(?i)videoUrl\\x3d(.+?)\\x26", webpage)
- if mobj is None:
- self._downloader.trouble(u'ERROR: unable to extract media URL')
- return
- mediaURL = compat_urllib_parse.unquote(mobj.group(1))
- mediaURL = mediaURL.replace('\\x3d', '\x3d')
- mediaURL = mediaURL.replace('\\x26', '\x26')
-
- video_url = mediaURL
-
- mobj = re.search(r'<title>(.*)</title>', webpage)
- if mobj is None:
- self._downloader.trouble(u'ERROR: unable to extract title')
- return
- video_title = mobj.group(1).decode('utf-8')
-
- # Extract video description
- mobj = re.search(r'<span id=short-desc-content>([^<]*)</span>', webpage)
- if mobj is None:
- self._downloader.trouble(u'ERROR: unable to extract video description')
- return
- video_description = mobj.group(1).decode('utf-8')
- if not video_description:
- video_description = 'No description available.'
-
- # Extract video thumbnail
- if self._downloader.params.get('forcethumbnail', False):
- request = compat_urllib_request.Request('http://video.google.com/videosearch?q=%s+site:video.google.com&hl=en' % abs(int(video_id)))
- try:
- webpage = compat_urllib_request.urlopen(request).read()
- except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
- self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
- return
- mobj = re.search(r'<img class=thumbnail-img (?:.* )?src=(http.*)>', webpage)
- if mobj is None:
- self._downloader.trouble(u'ERROR: unable to extract video thumbnail')
- return
- video_thumbnail = mobj.group(1)
- else: # we need something to pass to process_info
- video_thumbnail = ''
-
- return [{
- 'id': video_id.decode('utf-8'),
- 'url': video_url.decode('utf-8'),
- 'uploader': None,
- 'upload_date': None,
- 'title': video_title,
- 'ext': video_extension.decode('utf-8'),
- }]
-
-
class PhotobucketIE(InfoExtractor):
"""Information extractor for photobucket.com."""
class YahooIE(InfoExtractor):
"""Information extractor for video.yahoo.com."""
+ _WORKING = False
# _VALID_URL matches all Yahoo! Video URLs
# _VPAGE_URL matches only the extractable '/watch/' URLs
_VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
except:
self._downloader.trouble(u'ERROR: unable to extract info section')
return
-
+
# Extract title
video_title = config["video"]["title"]
- # Extract uploader
+ # Extract uploader and uploader_id
video_uploader = config["video"]["owner"]["name"]
+ video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]
# Extract video thumbnail
video_thumbnail = config["video"]["thumbnail"]
# Extract video description
- video_description = get_element_by_id("description", webpage)
+ video_description = get_element_by_attribute("itemprop", "description", webpage)
if video_description: video_description = clean_html(video_description)
else: video_description = ''
# Extract upload date
video_upload_date = None
- mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
+ mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
if mobj is not None:
- video_upload_date = mobj.group(1)
+ video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)
# Vimeo specific: extract request signature and timestamp
sig = config['request']['signature']
'id': video_id,
'url': video_url,
'uploader': video_uploader,
+ 'uploader_id': video_uploader_id,
'upload_date': video_upload_date,
'title': video_title,
'ext': video_extension,
self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)
def fetch_webpage(self, url):
- self._downloader.increment_downloads()
request = compat_urllib_request.Request(url)
try:
self.report_download_webpage(url)
'url': compat_urllib_parse.unquote(info.get('url')),
'uploader': u'arte.tv',
'upload_date': info.get('date'),
- 'title': info.get('title'),
+ 'title': info.get('title').decode('utf-8'),
'ext': u'mp4',
'format': u'NA',
'player_url': None,
def report_following_redirect(self, new_url):
"""Report that a redirect to new_url is being followed."""
self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
-
+
def _test_redirect(self, url):
"""Check whether the URL is a redirect (e.g. from a URL shortener) and, if so, restart the chain."""
class HeadRequest(compat_urllib_request.Request):
class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
"""
- Subclass the HTTPRedirectHandler to make it use our
+ Subclass the HTTPRedirectHandler to make it use our
HeadRequest also on the redirected URL
"""
- def redirect_request(self, req, fp, code, msg, headers, newurl):
+ def redirect_request(self, req, fp, code, msg, headers, newurl):
if code in (301, 302, 303, 307):
- newurl = newurl.replace(' ', '%20')
+ newurl = newurl.replace(' ', '%20')
newheaders = dict((k,v) for k,v in req.headers.items()
if k.lower() not in ("content-length", "content-type"))
- return HeadRequest(newurl,
+ return HeadRequest(newurl,
headers=newheaders,
- origin_req_host=req.get_origin_req_host(),
- unverifiable=True)
- else:
- raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)
+ origin_req_host=req.get_origin_req_host(),
+ unverifiable=True)
+ else:
+ raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)
class HTTPMethodFallback(compat_urllib_request.BaseHandler):
"""
Fallback to GET if HEAD is not allowed (405 HTTP error)
"""
- def http_error_405(self, req, fp, code, msg, headers):
+ def http_error_405(self, req, fp, code, msg, headers):
fp.read()
fp.close()
newheaders = dict((k,v) for k,v in req.headers.items()
if k.lower() not in ("content-length", "content-type"))
- return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
- headers=newheaders,
- origin_req_host=req.get_origin_req_host(),
+ return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
+ headers=newheaders,
+ origin_req_host=req.get_origin_req_host(),
unverifiable=True))
# Build our opener
- opener = compat_urllib_request.OpenerDirector()
+ opener = compat_urllib_request.OpenerDirector()
for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
HTTPMethodFallback, HEADRedirectHandler,
compat_urllib_error.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
class YahooSearchIE(InfoExtractor):
"""Information Extractor for Yahoo! Video search queries."""
+
+ _WORKING = False
_VALID_URL = r'yvsearch(\d+|all)?:[\s\S]+'
_TEMPLATE_URL = 'http://video.yahoo.com/search/?p=%s&o=%s'
_VIDEO_INDICATOR = r'href="http://video\.yahoo\.com/watch/([0-9]+/[0-9]+)"'
url = self._TEMPLATE_URL % (playlist_access, playlist_prefix, playlist_id, pagenum)
request = compat_urllib_request.Request(url)
try:
- page = compat_urllib_request.urlopen(request).read().decode('utf8')
+ page = compat_urllib_request.urlopen(request).read().decode('utf-8')
except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
return
request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
try:
- page = compat_urllib_request.urlopen(request).read()
+ page = compat_urllib_request.urlopen(request).read().decode('utf-8')
except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
return
"""Information extractor for depositfiles.com"""
_VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
- IE_NAME = u'DepositFiles'
-
- def __init__(self, downloader=None):
- InfoExtractor.__init__(self, downloader)
def report_download_webpage(self, file_id):
"""Report webpage download."""
video_description = video_info.get('description', 'No description available.')
url_map = video_info['video_urls']
- if len(url_map.keys()) > 0:
+ if url_map:
# Decide which formats to download
req_format = self._downloader.params.get('format', None)
format_limit = self._downloader.params.get('format_limit', None)
def __init__(self, downloader=None):
InfoExtractor.__init__(self, downloader)
-
- def report_download_webpage(self, video_id):
- """Report webpage download."""
- self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
def report_extraction(self, video_id):
"""Report information extraction."""
video_id = mobj.group(1)
# Get video webpage
- request = compat_urllib_request.Request('http://www.myvideo.de/watch/%s' % video_id)
- try:
- self.report_download_webpage(video_id)
- webpage = compat_urllib_request.urlopen(request).read()
- except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
- self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
- return
+ webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
+ webpage = self._download_webpage(webpage_url, video_id)
self.report_extraction(video_id)
mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
"""Information extractor for The Daily Show and Colbert Report """
# urls can be abbreviations like :thedailyshow or :colbert
- # urls for episodes like:
+ # urls for episodes like:
# or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
# or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
- # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
+ # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
_VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
|(https?://)?(www\.)?
(?P<showname>thedailyshow|colbertnation)\.com/
(?P<clip>
(the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
|(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
- $"""
+ $"""
IE_NAME = u'comedycentral'
_available_formats = ['3500', '2200', '1700', '1200', '750', '400']
return
else:
mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
-
+
playerUrl_raw = mMovieParams[0][0]
self.report_player_url(epTitle)
try:
if len(turls) == 0:
self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
continue
-
+
if self._downloader.params.get('listformats', None):
self._print_formats([i[0] for i in turls])
return
}
results.append(info)
-
+
return results
self.report_config_download(showName)
try:
- configJSON = compat_urllib_request.urlopen(configUrl).read()
+ configJSON = compat_urllib_request.urlopen(configUrl)
+ m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
+ configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
self._downloader.trouble(u'ERROR: unable to download configuration: ' + compat_str(err))
return
_VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
IE_NAME = u'xvideos'
- def report_webpage(self, video_id):
- """Report information extraction."""
- self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
-
def report_extraction(self, video_id):
"""Report information extraction."""
self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
return
video_id = mobj.group(1)
- self.report_webpage(video_id)
-
- request = compat_urllib_request.Request(r'http://www.xvideos.com/video' + video_id)
- try:
- webpage_bytes = compat_urllib_request.urlopen(request).read()
- webpage = webpage_bytes.decode('utf-8', 'replace')
- except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
- self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
- return
+ webpage = self._download_webpage(url, video_id)
self.report_extraction(video_id)
stream_json_bytes = compat_urllib_request.urlopen(request).read()
stream_json = stream_json_bytes.decode('utf-8')
except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
- self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
+ self._downloader.trouble(u'ERROR: unable to download stream definitions: %s' % compat_str(err))
return
streams = json.loads(stream_json)
class InfoQIE(InfoExtractor):
"""Information extractor for infoq.com"""
-
_VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
- IE_NAME = u'infoq'
-
- def report_webpage(self, video_id):
- """Report information extraction."""
- self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
def report_extraction(self, video_id):
"""Report information extraction."""
self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
return
- self.report_webpage(url)
-
- request = compat_urllib_request.Request(url)
- try:
- webpage = compat_urllib_request.urlopen(request).read()
- except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
- self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
- return
-
+ webpage = self._download_webpage(url, video_id=url)
self.report_extraction(url)
-
# Extract video URL
mobj = re.search(r"jsclassref='([^']*)'", webpage)
if mobj is None:
self._downloader.trouble(u'ERROR: unable to extract video url')
return
- video_url = 'rtmpe://video.infoq.com/cfx/st/' + compat_urllib_parse.unquote(mobj.group(1).decode('base64'))
-
+ real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
+ video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id
# Extract title
mobj = re.search(r'contentTitle = "(.*?)";', webpage)
if mobj is None:
self._downloader.trouble(u'ERROR: unable to extract video title')
return
- video_title = mobj.group(1).decode('utf-8')
+ video_title = mobj.group(1)
# Extract description
video_description = u'No description available.'
mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
if mobj is not None:
- video_description = mobj.group(1).decode('utf-8')
+ video_description = mobj.group(1)
video_filename = video_url.split('/')[-1]
video_id, extension = video_filename.split('.')
class MixcloudIE(InfoExtractor):
"""Information extractor for www.mixcloud.com"""
+
+ _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
_VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
IE_NAME = u'mixcloud'
if file_url is not None:
break # got it!
else:
- if req_format not in formats.keys():
+ if req_format not in formats:
self._downloader.trouble(u'ERROR: format is not available')
return
assert entry['type'] == 'reference'
results += self.extract(entry['url'])
return results
-
+
else: # Root page
info = {
'id': 'Stanford OpenClassroom',
_VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
IE_NAME = u'mtv'
- def report_webpage(self, video_id):
- """Report information extraction."""
- self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
-
def report_extraction(self, video_id):
"""Report information extraction."""
self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
if not mobj.group('proto'):
url = 'http://' + url
video_id = mobj.group('videoid')
- self.report_webpage(video_id)
- request = compat_urllib_request.Request(url)
- try:
- webpage = compat_urllib_request.urlopen(request).read()
- except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
- self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
- return
+ webpage = self._download_webpage(url, video_id)
mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
if mobj is None:
self._downloader.trouble(u'ERROR: unable to extract performer')
return
performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
- video_title = performer + ' - ' + song_name
+ video_title = performer + ' - ' + song_name
mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
if mobj is None:
class YoukuIE(InfoExtractor):
-
_VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
- IE_NAME = u'Youku'
-
- def __init__(self, downloader=None):
- InfoExtractor.__init__(self, downloader)
def report_download_webpage(self, file_id):
"""Report webpage download."""
- self._downloader.to_screen(u'[Youku] %s: Downloading webpage' % file_id)
+ self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, file_id))
def report_extraction(self, file_id):
"""Report information extraction."""
- self._downloader.to_screen(u'[Youku] %s: Extracting information' % file_id)
+ self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
def _gen_sid(self):
nowTime = int(time.time() * 1000)
seed = config['data'][0]['seed']
format = self._downloader.params.get('format', None)
- supported_format = config['data'][0]['streamfileids'].keys()
+ supported_format = list(config['data'][0]['streamfileids'].keys())
if format is None or format == 'best':
if 'hd2' in supported_format:
class GooglePlusIE(InfoExtractor):
"""Information extractor for plus.google.com."""
- _VALID_URL = r'(?:https://)?plus\.google\.com/(?:\w+/)*?(\d+)/posts/(\w+)'
+ _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
IE_NAME = u'plus.google'
def __init__(self, downloader=None):
def report_extract_entry(self, url):
"""Report that the entry at url is being downloaded."""
- self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url.decode('utf-8'))
+ self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)
def report_date(self, upload_date):
"""Report downloading entry"""
def report_uploader(self, uploader):
"""Report the uploader name."""
- self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader.decode('utf-8'))
+ self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)
def report_title(self, video_title):
"""Report the video title."""
- self._downloader.to_screen(u'[plus.google] Title: %s' % video_title.decode('utf-8'))
+ self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)
def report_extract_vid_page(self, video_page):
"""Report information extraction."""
- self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page.decode('utf-8'))
+ self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)
def _real_extract(self, url):
# Extract id from URL
return
post_url = mobj.group(0)
- video_id = mobj.group(2)
+ video_id = mobj.group(1)
video_extension = 'flv'
self.report_extract_entry(post_url)
request = compat_urllib_request.Request(post_url)
try:
- webpage = compat_urllib_request.urlopen(request).read()
+ webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))
return
video_page = mobj.group(1)
request = compat_urllib_request.Request(video_page)
try:
- webpage = compat_urllib_request.urlopen(request).read()
+ webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
return
# Only get the url. The resolution part in the tuple has no use anymore
video_url = video_url[-1]
# Treat escaped \u0026 style hex
- video_url = unicode(video_url, "unicode_escape")
+ try:
+ video_url = video_url.decode("unicode_escape")
+ except AttributeError: # Python 3
+ video_url = bytes(video_url, 'ascii').decode('unicode-escape')
return [{
- 'id': video_id.decode('utf-8'),
+ 'id': video_id,
'url': video_url,
- 'uploader': uploader.decode('utf-8'),
- 'upload_date': upload_date.decode('utf-8'),
- 'title': video_title.decode('utf-8'),
- 'ext': video_extension.decode('utf-8'),
+ 'uploader': uploader,
+ 'upload_date': upload_date,
+ 'title': video_title,
+ 'ext': video_extension,
}]
class NBAIE(InfoExtractor):
_VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
IE_NAME = u'nba'
- def report_extraction(self, video_id):
- self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
-
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
if mobj is None:
if video_id.endswith('/index.html'):
video_id = video_id[:-len('/index.html')]
- self.report_extraction(video_id)
- try:
- urlh = compat_urllib_request.urlopen(url)
- webpage_bytes = urlh.read()
- webpage = webpage_bytes.decode('utf-8', 'ignore')
- except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
- self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
- return
+ webpage = self._download_webpage(url, video_id)
video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
def _findProp(rexp, default=None):
except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
self._downloader.trouble(u'ERROR: unable to download video info JSON: %s' % compat_str(err))
return
-
+
response = json.loads(webpage)
info = []
for clip in response:
if mobj is None:
self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
return
-
+
api = 'http://api.justin.tv'
video_id = mobj.group(mobj.lastindex)
paged = False
else:
api += '/clip/show/%s.json'
api = api % (video_id,)
-
+
self.report_extraction(video_id)
-
+
info = []
offset = 0
limit = self._JUSTIN_PAGE_LIMIT
break
offset += limit
return info
+
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com video pages."""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        """Extract the video URL, title and description from a video page.

        Returns a one-element list holding the info dictionary. When the
        URL does not match, or the page lacks the video source or the
        title, the problem is reported through the downloader and None
        is returned.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        # The second <source> element inside <video> carries the URL used here.
        m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
        if not m:
            self._downloader.trouble(u'ERROR: unable to find video information')
            # Stop here: without a match there is nothing to read the URL from.
            return
        video_url = unescapeHTML(m.group('url'))

        m = re.search(r"class='player_page_h1'>\s+<a.*?>(?P<title>.*?)</a>", webpage)
        if not m:
            self._downloader.trouble(u'Cannot find video title')
            # The title is a mandatory field, so give up on this video.
            return
        title = unescapeHTML(m.group('title'))

        # The description is optional; a page without it still yields a result.
        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        desc = unescapeHTML(m.group('desc')) if m else None

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
        }
        return [info]
+
class TweetReelIE(InfoExtractor):
    """Information extractor for tweetreel.com."""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?tweetreel\.com/[?](?P<id>[0-9a-z]+)$'

    def _real_extract(self, url):
        """Extract video information from a tweetreel page.

        Returns a one-element list holding the info dictionary. When the
        URL does not match, or the status ID, uploader or upload date
        cannot be found, the problem is reported through the downloader
        and None is returned. A missing description is only reported as
        a warning; the video id is then used as the title.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        m = re.search(r'<div id="left" status_id="([0-9]+)">', webpage)
        if not m:
            self._downloader.trouble(u'ERROR: Cannot find status ID')
            # The media URL is built from the status ID; nothing to do without it.
            return
        status_id = m.group(1)

        m = re.search(r'<div class="tweet_text">(.*?)</div>', webpage, flags=re.DOTALL)
        if m:
            # Drop embedded links before unescaping the tweet text.
            desc = unescapeHTML(re.sub('<a.*?</a>', '', m.group(1))).strip()
        else:
            # Reported as a warning, so carry on without a description.
            self._downloader.trouble(u'WARNING: Cannot find description')
            desc = None

        m = re.search(r'<div class="tweet_info">.*?from <a target="_blank" href="https?://twitter.com/(?P<uploader_id>.+?)">(?P<uploader>.+?)</a>', webpage, flags=re.DOTALL)
        if not m:
            self._downloader.trouble(u'ERROR: Cannot find uploader')
            return
        uploader = unescapeHTML(m.group('uploader'))
        uploader_id = unescapeHTML(m.group('uploader_id'))

        m = re.search(r'<span unixtime="([0-9]+)"', webpage)
        if not m:
            self._downloader.trouble(u'ERROR: Cannot find upload date')
            return
        # NOTE(review): fromtimestamp() converts using the local time zone, so
        # the resulting date can differ between machines; confirm this is intended.
        upload_date = datetime.datetime.fromtimestamp(int(m.group(1))).strftime('%Y%m%d')

        # The tweet text doubles as the title; fall back to the id when it is missing.
        title = desc if desc is not None else video_id
        video_url = 'http://files.tweetreel.com/video/' + status_id + '.mov'

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mov',
            'title': title,
            'description': desc,
            'uploader': uploader,
            'uploader_id': uploader_id,
            'internal_id': status_id,
            'upload_date': upload_date
        }
        return [info]
+
class SteamIE(InfoExtractor):
    """Information extractor for store.steampowered.com video and app pages."""

    # Matched with re.VERBOSE, so whitespace and '#' comments in the pattern are ignored.
    _VALID_URL = r"""http://store.steampowered.com/
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        """Extract every video listed on the game's video page.

        Returns a list with one info dictionary per video found, pairing
        each movie entry with the title at the same position on the page.
        Reports an error through the downloader and returns None when
        the URL does not match.
        """
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        if m is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        namesRE = r'<span class="title">(?P<videoName>.+?)</span>'

        gameID = m.group('gameID')
        # Always read the game's video listing, whichever URL form was given.
        videourl = 'http://store.steampowered.com/video/%s/' % gameID
        webpage = self._download_webpage(videourl, gameID)

        mweb = re.finditer(urlRE, webpage)
        titles = re.finditer(namesRE, webpage)
        videos = []
        # Movie entries and title spans are matched up by their order on the page.
        for vid, vtitle in zip(mweb, titles):
            video_id = vid.group('videoID')
            title = vtitle.group('videoName')
            video_url = vid.group('videoURL')
            if not video_url:
                self._downloader.trouble(u'ERROR: Cannot find video url for %s' % video_id)
                # Do not emit an entry that has nothing to download.
                continue
            info = {
                'id': video_id,
                'url': video_url,
                'ext': 'flv',
                'title': unescapeHTML(title)
            }
            videos.append(info)
        return videos
+
class UstreamIE(InfoExtractor):
    """Information extractor for recorded videos on ustream.tv."""

    _VALID_URL = r'http://www.ustream.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        """Extract the video URL, title and uploader of a recorded video.

        Returns a one-element list holding the info dictionary. When the
        URL does not match, or the title or uploader cannot be found on
        the page, an error is reported through the downloader and None
        is returned.
        """
        m = re.match(self._VALID_URL, url)
        if m is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return
        video_id = m.group('videoID')
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # Stop at the closing quote so that attributes following data-title
        # on the same line do not end up in the title.
        m = re.search(r'data-title="(?P<title>[^"]+)"', webpage)
        if m is None:
            self._downloader.trouble(u'ERROR: unable to extract title')
            return
        title = m.group('title')

        m = re.search(r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"', webpage)
        if m is None:
            self._downloader.trouble(u'ERROR: unable to extract uploader')
            return
        uploader = m.group('uploader')

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'flv',
            'title': title,
            'uploader': uploader
        }
        return [info]
+
+
def gen_extractors():
    """Build one instance of each supported extractor.

    The position in the returned list is significant: the first
    extractor that matches a URL is the one that handles it.
    """
    extractor_classes = (
        YoutubePlaylistIE,
        YoutubeChannelIE,
        YoutubeUserIE,
        YoutubeSearchIE,
        YoutubeIE,
        MetacafeIE,
        DailymotionIE,
        GoogleSearchIE,
        PhotobucketIE,
        YahooIE,
        YahooSearchIE,
        DepositFilesIE,
        FacebookIE,
        BlipTVUserIE,
        BlipTVIE,
        VimeoIE,
        MyVideoIE,
        ComedyCentralIE,
        EscapistIE,
        CollegeHumorIE,
        XVideosIE,
        SoundcloudIE,
        InfoQIE,
        MixcloudIE,
        StanfordOpenClassroomIE,
        MTVIE,
        YoukuIE,
        XNXXIE,
        GooglePlusIE,
        ArteTvIE,
        NBAIE,
        JustinTVIE,
        FunnyOrDieIE,
        TweetReelIE,
        SteamIE,
        UstreamIE,
        GenericIE,
    )
    # Instantiate in declaration order so matching priority is preserved.
    return [ie_class() for ie_class in extractor_classes]
+
+