from __future__ import absolute_import
+import base64
import datetime
import netrc
import os
id: Video identifier.
url: Final video URL.
- uploader: Full name of the video uploader, unescaped.
- upload_date: Video upload date (YYYYMMDD).
title: Video title, unescaped.
ext: Video filename extension.
+ uploader: Full name of the video uploader.
+ upload_date: Video upload date (YYYYMMDD).
The following fields are optional:
"""Real extraction process. Redefine in subclasses."""
pass
+ @property
+ def IE_NAME(self):
+ return type(self).__name__[:-2]
+
    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """Download a webpage and return its content decoded to unicode.

        url_or_request: a URL string or a compat_urllib_request.Request object.
        video_id: identifier used only in the progress/error messages.
        note: custom progress message (default: 'Downloading video webpage').
        errnote: custom error message (default: 'Unable to download webpage').

        Raises ExtractorError on any network/HTTP/socket failure.
        """
        if note is None:
            note = u'Downloading video webpage'
        self._downloader.to_screen(u'[%s] %s: %s' % (self.IE_NAME, video_id, note))
        try:
            urlh = compat_urllib_request.urlopen(url_or_request)
            webpage_bytes = urlh.read()
            # Decode as UTF-8; 'replace' keeps extraction alive on pages with bad bytes.
            # NOTE(review): any charset in the Content-Type header is ignored — confirm intended.
            return webpage_bytes.decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is None:
                errnote = u'Unable to download webpage'
            # Second argument forwards the original traceback into ExtractorError.
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])
+
class YoutubeIE(InfoExtractor):
"""Information extractor for youtube.com."""
|(?: # or the v= param in all its forms
(?:watch(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
(?:\?|\#!?) # the params delimiter ? or # or #!
- (?:.+&)? # any other preceding param (like /?s=tuff&v=xxxx)
+ (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
v=
)
)? # optional -> youtube.com/xxxx is OK
self._downloader.trouble(u'ERROR: unable to confirm age: %s' % compat_str(err))
return
- def _real_extract(self, url):
- # Extract original video URL from URL with redirection, like age verification, using next_url parameter
- mobj = re.search(self._NEXT_URL_RE, url)
- if mobj:
- url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
-
- # Extract video id from URL
+ def _extract_id(self, url):
mobj = re.match(self._VALID_URL, url, re.VERBOSE)
if mobj is None:
self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
return
video_id = mobj.group(2)
+ return video_id
+
+ def _real_extract(self, url):
+ # Extract original video URL from URL with redirection, like age verification, using next_url parameter
+ mobj = re.search(self._NEXT_URL_RE, url)
+ if mobj:
+ url = 'http://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
+ video_id = self._extract_id(url)
# Get video webpage
self.report_video_webpage_download(video_id)
- request = compat_urllib_request.Request('http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id)
+ url = 'http://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
+ request = compat_urllib_request.Request(url)
try:
video_webpage_bytes = compat_urllib_request.urlopen(request).read()
except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
# uploader_id
video_uploader_id = None
- mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/user/([^"]+)">', video_webpage)
+ mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
if mobj is not None:
video_uploader_id = mobj.group(1)
else:
def __init__(self, downloader=None):
InfoExtractor.__init__(self, downloader)
- def report_download_webpage(self, video_id):
- """Report webpage download."""
- self._downloader.to_screen(u'[dailymotion] %s: Downloading webpage' % video_id)
-
def report_extraction(self, video_id):
"""Report information extraction."""
self._downloader.to_screen(u'[dailymotion] %s: Extracting information' % video_id)
# Retrieve video webpage to extract further information
request = compat_urllib_request.Request(url)
request.add_header('Cookie', 'family_filter=off')
- try:
- self.report_download_webpage(video_id)
- webpage_bytes = compat_urllib_request.urlopen(request).read()
- webpage = webpage_bytes.decode('utf-8')
- except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
- self._downloader.trouble(u'ERROR: unable retrieve video webpage: %s' % compat_str(err))
- return
+ webpage = self._download_webpage(request, video_id)
# Extract URL, uploader and title from webpage
self.report_extraction(video_id)
# Extract upload date
video_upload_date = None
- mobj = re.search(r'<span id="clip-date" style="display:none">[^:]*: (.*?)( \([^\(]*\))?</span>', webpage)
+ mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
if mobj is not None:
- video_upload_date = mobj.group(1)
+ video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)
# Vimeo specific: extract request signature and timestamp
sig = config['request']['signature']
self._downloader.to_screen(u'[arte.tv] %s: Extracting information' % video_id)
def fetch_webpage(self, url):
- self._downloader.increment_downloads()
request = compat_urllib_request.Request(url)
try:
self.report_download_webpage(url)
"""Information extractor for depositfiles.com"""
_VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
- IE_NAME = u'DepositFiles'
-
- def __init__(self, downloader=None):
- InfoExtractor.__init__(self, downloader)
def report_download_webpage(self, file_id):
"""Report webpage download."""
video_description = video_info.get('description', 'No description available.')
url_map = video_info['video_urls']
- if len(list(url_map.keys())) > 0:
+ if url_map:
# Decide which formats to download
req_format = self._downloader.params.get('format', None)
format_limit = self._downloader.params.get('format_limit', None)
def __init__(self, downloader=None):
InfoExtractor.__init__(self, downloader)
- def report_download_webpage(self, video_id):
- """Report webpage download."""
- self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
-
def report_extraction(self, video_id):
"""Report information extraction."""
self._downloader.to_screen(u'[myvideo] %s: Extracting information' % video_id)
video_id = mobj.group(1)
# Get video webpage
- request = compat_urllib_request.Request('http://www.myvideo.de/watch/%s' % video_id)
- try:
- self.report_download_webpage(video_id)
- webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
- except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
- self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
- return
+ webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
+ webpage = self._download_webpage(webpage_url, video_id)
self.report_extraction(video_id)
mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
_VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
IE_NAME = u'xvideos'
- def report_webpage(self, video_id):
- """Report information extraction."""
- self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
-
def report_extraction(self, video_id):
"""Report information extraction."""
self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
return
video_id = mobj.group(1)
- self.report_webpage(video_id)
-
- request = compat_urllib_request.Request(r'http://www.xvideos.com/video' + video_id)
- try:
- webpage_bytes = compat_urllib_request.urlopen(request).read()
- webpage = webpage_bytes.decode('utf-8', 'replace')
- except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
- self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
- return
+ webpage = self._download_webpage(url, video_id)
self.report_extraction(video_id)
stream_json_bytes = compat_urllib_request.urlopen(request).read()
stream_json = stream_json_bytes.decode('utf-8')
except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
- self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
+ self._downloader.trouble(u'ERROR: unable to download stream definitions: %s' % compat_str(err))
return
streams = json.loads(stream_json)
class InfoQIE(InfoExtractor):
"""Information extractor for infoq.com"""
-
_VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'
- IE_NAME = u'infoq'
-
- def report_webpage(self, video_id):
- """Report information extraction."""
- self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
def report_extraction(self, video_id):
"""Report information extraction."""
self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
return
- self.report_webpage(url)
-
- request = compat_urllib_request.Request(url)
- try:
- webpage = compat_urllib_request.urlopen(request).read()
- except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
- self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
- return
-
+ webpage = self._download_webpage(url, video_id=url)
self.report_extraction(url)
-
# Extract video URL
mobj = re.search(r"jsclassref='([^']*)'", webpage)
if mobj is None:
self._downloader.trouble(u'ERROR: unable to extract video url')
return
- video_url = 'rtmpe://video.infoq.com/cfx/st/' + compat_urllib_parse.unquote(mobj.group(1).decode('base64'))
-
+ real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
+ video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id
# Extract title
mobj = re.search(r'contentTitle = "(.*?)";', webpage)
if mobj is None:
self._downloader.trouble(u'ERROR: unable to extract video title')
return
- video_title = mobj.group(1).decode('utf-8')
+ video_title = mobj.group(1)
# Extract description
video_description = u'No description available.'
mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
if mobj is not None:
- video_description = mobj.group(1).decode('utf-8')
+ video_description = mobj.group(1)
video_filename = video_url.split('/')[-1]
video_id, extension = video_filename.split('.')
if file_url is not None:
break # got it!
else:
- if req_format not in list(formats.keys()):
+ if req_format not in formats:
self._downloader.trouble(u'ERROR: format is not available')
return
_VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
IE_NAME = u'mtv'
- def report_webpage(self, video_id):
- """Report information extraction."""
- self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, video_id))
-
def report_extraction(self, video_id):
"""Report information extraction."""
self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
if not mobj.group('proto'):
url = 'http://' + url
video_id = mobj.group('videoid')
- self.report_webpage(video_id)
- request = compat_urllib_request.Request(url)
- try:
- webpage = compat_urllib_request.urlopen(request).read()
- except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
- self._downloader.trouble(u'ERROR: unable to download video webpage: %s' % compat_str(err))
- return
+ webpage = self._download_webpage(url, video_id)
mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
if mobj is None:
class YoukuIE(InfoExtractor):
-
_VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
- IE_NAME = u'Youku'
-
- def __init__(self, downloader=None):
- InfoExtractor.__init__(self, downloader)
def report_download_webpage(self, file_id):
"""Report webpage download."""
- self._downloader.to_screen(u'[Youku] %s: Downloading webpage' % file_id)
+ self._downloader.to_screen(u'[%s] %s: Downloading webpage' % (self.IE_NAME, file_id))
def report_extraction(self, file_id):
"""Report information extraction."""
- self._downloader.to_screen(u'[Youku] %s: Extracting information' % file_id)
+ self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, file_id))
def _gen_sid(self):
nowTime = int(time.time() * 1000)
_VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
IE_NAME = u'nba'
- def report_extraction(self, video_id):
- self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
-
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
if mobj is None:
if video_id.endswith('/index.html'):
video_id = video_id[:-len('/index.html')]
- self.report_extraction(video_id)
- try:
- urlh = compat_urllib_request.urlopen(url)
- webpage_bytes = urlh.read()
- webpage = webpage_bytes.decode('utf-8', 'ignore')
- except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
- self._downloader.trouble(u'ERROR: unable to download video info XML: %s' % compat_str(err))
- return
+ webpage = self._download_webpage(url, video_id)
video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
def _findProp(rexp, default=None):
break
offset += limit
return info
+
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com videos."""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        """Extract id, URL, title and description from a Funny or Die video page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        # The direct video URL lives in the second <source> tag of the HTML5 player.
        m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
        if not m:
            # Bail out instead of crashing on m.group() below.
            self._downloader.trouble(u'ERROR: unable to find video information')
            return
        video_url = unescapeHTML(m.group('url'))

        m = re.search(r"class='player_page_h1'>\s+<a.*?>(?P<title>.*?)</a>", webpage)
        if not m:
            # Bail out instead of crashing on m.group() below.
            self._downloader.trouble(u'Cannot find video title')
            return
        title = unescapeHTML(m.group('title'))

        # Description is optional; fall back to None when the meta tag is absent.
        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
        if m:
            desc = unescapeHTML(m.group('desc'))
        else:
            desc = None

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': desc,
        }
        return [info]
+
class TweetReelIE(InfoExtractor):
    """Information extractor for tweetreel.com video tweets."""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?tweetreel\.com/[?](?P<id>[0-9a-z]+)$'

    def _real_extract(self, url):
        """Extract video URL, tweet text, uploader and upload date from a TweetReel page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
            return

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        m = re.search(r'<div id="left" status_id="([0-9]+)">', webpage)
        if not m:
            # Bail out instead of crashing on m.group() below.
            self._downloader.trouble(u'ERROR: Cannot find status ID')
            return
        status_id = m.group(1)

        # The description is best-effort (only a WARNING): continue without it
        # rather than crashing on a None match object.
        m = re.search(r'<div class="tweet_text">(.*?)</div>', webpage, flags=re.DOTALL)
        if not m:
            self._downloader.trouble(u'WARNING: Cannot find description')
            desc = None
        else:
            # Strip embedded <a> links from the tweet text before unescaping.
            desc = unescapeHTML(re.sub('<a.*?</a>', '', m.group(1))).strip()

        m = re.search(r'<div class="tweet_info">.*?from <a target="_blank" href="https?://twitter.com/(?P<uploader_id>.+?)">(?P<uploader>.+?)</a>', webpage, flags=re.DOTALL)
        if not m:
            # Bail out instead of crashing on m.group() below.
            self._downloader.trouble(u'ERROR: Cannot find uploader')
            return
        uploader = unescapeHTML(m.group('uploader'))
        uploader_id = unescapeHTML(m.group('uploader_id'))

        m = re.search(r'<span unixtime="([0-9]+)"', webpage)
        if not m:
            # Bail out instead of crashing on m.group() below.
            self._downloader.trouble(u'ERROR: Cannot find upload date')
            return
        # Convert the unix timestamp to the YYYYMMDD form expected in info dicts.
        upload_date = datetime.datetime.fromtimestamp(int(m.group(1))).strftime('%Y%m%d')

        title = desc
        video_url = 'http://files.tweetreel.com/video/' + status_id + '.mov'

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mov',
            'title': title,
            'description': desc,
            'uploader': uploader,
            'uploader_id': uploader_id,
            'internal_id': status_id,
            'upload_date': upload_date
        }
        return [info]
+
class SteamIE(InfoExtractor):
    """Information extractor for store.steampowered.com game trailers."""
    # Dots escaped so the host part matches literally, not any character.
    _VALID_URL = r"""http://store\.steampowered\.com/
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """

    def suitable(self, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL is a verbose pattern and must be
        # matched with re.VERBOSE.
        return re.match(self._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        """Extract every trailer listed on the game's video page."""
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        # Each movie entry in the page's JS carries FILENAME and, optionally, MOVIE_NAME.
        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        gameID = m.group('gameID')
        videourl = 'http://store.steampowered.com/video/%s/' % gameID
        webpage = self._download_webpage(videourl, gameID)
        mweb = re.finditer(urlRE, webpage)
        namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
        titles = re.finditer(namesRE, webpage)
        videos = []
        for vid, vtitle in zip(mweb, titles):
            video_id = vid.group('videoID')
            title = vtitle.group('videoName')
            video_url = vid.group('videoURL')
            if not video_url:
                # Skip this entry instead of emitting an info dict with no URL.
                self._downloader.trouble(u'ERROR: Cannot find video url for %s' % video_id)
                continue
            info = {
                'id': video_id,
                'url': video_url,
                'ext': 'flv',
                'title': unescapeHTML(title)
            }
            videos.append(info)
        return videos
+
class UstreamIE(InfoExtractor):
    """Information extractor for ustream.tv recorded videos."""
    # Dots escaped so the host part matches literally, not any character.
    _VALID_URL = r'http://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        """Extract the CDN video URL, title and uploader for a recorded stream."""
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')
        # Recorded videos are served as FLV straight from the ustream CDN.
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)
        m = re.search(r'data-title="(?P<title>.+)"', webpage)
        if m is None:
            # Report and bail out instead of crashing on m.group() below,
            # matching the error handling of the other extractors.
            self._downloader.trouble(u'ERROR: Cannot find video title')
            return
        title = m.group('title')
        m = re.search(r'<a class="state" data-content-type="channel" data-content-id="(?P<uploader>\d+)"', webpage)
        if m is None:
            self._downloader.trouble(u'ERROR: Cannot find uploader')
            return
        uploader = m.group('uploader')
        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'flv',
            'title': title,
            'uploader': uploader
        }
        return [info]
+
+
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
    """
    # One class per line; instantiated below in the same order.
    extractor_classes = (
        YoutubePlaylistIE,
        YoutubeChannelIE,
        YoutubeUserIE,
        YoutubeSearchIE,
        YoutubeIE,
        MetacafeIE,
        DailymotionIE,
        GoogleSearchIE,
        PhotobucketIE,
        YahooIE,
        YahooSearchIE,
        DepositFilesIE,
        FacebookIE,
        BlipTVUserIE,
        BlipTVIE,
        VimeoIE,
        MyVideoIE,
        ComedyCentralIE,
        EscapistIE,
        CollegeHumorIE,
        XVideosIE,
        SoundcloudIE,
        InfoQIE,
        MixcloudIE,
        StanfordOpenClassroomIE,
        MTVIE,
        YoukuIE,
        XNXXIE,
        GooglePlusIE,
        ArteTvIE,
        NBAIE,
        JustinTVIE,
        FunnyOrDieIE,
        TweetReelIE,
        SteamIE,
        UstreamIE,
        GenericIE,
    )
    return [klass() for klass in extractor_classes]
+
+