X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2FInfoExtractors.py;h=8b2442bacc21e86db7ffdc920116b51e79fd3820;hb=5e34d2ebbf9906bded4201d7bd8bb82e9353de9f;hp=208b44887545ee1bba04e598787e6c574d5bd5cf;hpb=69fc019f268116a4b5dcccca00ddf9153748c305;p=youtube-dl
diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py
index 208b44887..8b2442bac 100755
--- a/youtube_dl/InfoExtractors.py
+++ b/youtube_dl/InfoExtractors.py
@@ -114,8 +114,8 @@ class InfoExtractor(object):
def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
""" Returns the response handle """
if note is None:
- note = u'Downloading video webpage'
- if note is not False:
+ self.report_download_webpage(video_id)
+ elif note is not False:
self.to_screen(u'%s: %s' % (video_id, note))
try:
return compat_urllib_request.urlopen(url_or_request)
@@ -152,6 +152,10 @@ class InfoExtractor(object):
"""Report information extraction."""
self.to_screen(u'%s: Extracting information' % id_or_name)
+ def report_download_webpage(self, video_id):
+ """Report webpage download."""
+ self.to_screen(u'%s: Downloading webpage' % video_id)
+
def report_age_confirmation(self):
"""Report attempt to confirm age."""
self.to_screen(u'Confirming age')
@@ -558,19 +562,18 @@ class YoutubeIE(InfoExtractor):
mobj = re.search(r'id="eow-date.*?>(.*?)', video_webpage, re.DOTALL)
if mobj is not None:
upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
- format_expressions = ['%d %B %Y', '%B %d %Y', '%b %d %Y']
- for expression in format_expressions:
- try:
- upload_date = datetime.datetime.strptime(upload_date, expression).strftime('%Y%m%d')
- except:
- pass
+ upload_date = unified_strdate(upload_date)
# description
video_description = get_element_by_id("eow-description", video_webpage)
if video_description:
video_description = clean_html(video_description)
else:
- video_description = u''
+ fd_mobj = re.search(r'https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?Pplay_redirect_hls\?clip_id=)?(?:videos?/)?(?P[0-9]+)'
IE_NAME = u'vimeo'
- def __init__(self, downloader=None):
- InfoExtractor.__init__(self, downloader)
-
- def report_download_webpage(self, video_id):
- """Report webpage download."""
- self.to_screen(u'%s: Downloading webpage' % video_id)
-
def _real_extract(self, url, new_video=True):
# Extract ID from URL
mobj = re.match(self._VALID_URL, url)
@@ -1098,13 +1068,7 @@ class VimeoIE(InfoExtractor):
# Retrieve video webpage to extract further information
request = compat_urllib_request.Request(url, None, std_headers)
- try:
- self.report_download_webpage(video_id)
- webpage_bytes = compat_urllib_request.urlopen(request).read()
- webpage = webpage_bytes.decode('utf-8')
- except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
- self._downloader.report_error(u'Unable to retrieve video webpage: %s' % compat_str(err))
- return
+ webpage = self._download_webpage(request, video_id)
# Now we begin extracting as much information as we can from what we
# retrieved. First we extract the information common to all extractors,
@@ -1116,7 +1080,10 @@ class VimeoIE(InfoExtractor):
config = webpage.split(' = {config:')[1].split(',assets:')[0]
config = json.loads(config)
except:
- self._downloader.report_error(u'unable to extract info section')
+ if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage):
+ self._downloader.report_error(u'The author has restricted the access to this video, try with the "--referer" option')
+ else:
+ self._downloader.report_error(u'unable to extract info section')
return
# Extract title
@@ -1193,13 +1160,6 @@ class ArteTvIE(InfoExtractor):
IE_NAME = u'arte.tv'
- def __init__(self, downloader=None):
- InfoExtractor.__init__(self, downloader)
-
- def report_download_webpage(self, video_id):
- """Report webpage download."""
- self.to_screen(u'%s: Downloading webpage' % video_id)
-
def fetch_webpage(self, url):
request = compat_urllib_request.Request(url)
try:
@@ -1323,14 +1283,11 @@ class GenericIE(InfoExtractor):
_VALID_URL = r'.*'
IE_NAME = u'generic'
- def __init__(self, downloader=None):
- InfoExtractor.__init__(self, downloader)
-
def report_download_webpage(self, video_id):
"""Report webpage download."""
if not self._downloader.params.get('test', False):
self._downloader.report_warning(u'Falling back on generic information extractor.')
- self.to_screen(u'%s: Downloading webpage' % video_id)
+ super(GenericIE, self).report_download_webpage(video_id)
def report_following_redirect(self, new_url):
"""Report information extraction."""
@@ -1465,9 +1422,6 @@ class YoutubeSearchIE(InfoExtractor):
_max_youtube_results = 1000
IE_NAME = u'youtube:search'
- def __init__(self, downloader=None):
- InfoExtractor.__init__(self, downloader)
-
def report_download_page(self, query, pagenum):
"""Report attempt to download search page with given number."""
query = query.decode(preferredencoding())
@@ -1542,9 +1496,6 @@ class GoogleSearchIE(InfoExtractor):
_max_google_results = 1000
IE_NAME = u'video.google:search'
- def __init__(self, downloader=None):
- InfoExtractor.__init__(self, downloader)
-
def report_download_page(self, query, pagenum):
"""Report attempt to download playlist page with given number."""
query = query.decode(preferredencoding())
@@ -1626,9 +1577,6 @@ class YahooSearchIE(InfoExtractor):
_max_yahoo_results = 1000
IE_NAME = u'video.yahoo:search'
- def __init__(self, downloader=None):
- InfoExtractor.__init__(self, downloader)
-
def report_download_page(self, query, pagenum):
"""Report attempt to download playlist page with given number."""
query = query.decode(preferredencoding())
@@ -1722,18 +1670,11 @@ class YoutubePlaylistIE(InfoExtractor):
_MAX_RESULTS = 50
IE_NAME = u'youtube:playlist'
- def __init__(self, downloader=None):
- InfoExtractor.__init__(self, downloader)
-
@classmethod
def suitable(cls, url):
"""Receives a URL and returns True if suitable for this IE."""
return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
- def report_download_page(self, playlist_id, pagenum):
- """Report attempt to download playlist page with given number."""
- self._downloader.to_screen(u'[youtube] PL %s: Downloading page #%s' % (playlist_id, pagenum))
-
def _real_extract(self, url):
# Extract playlist id
mobj = re.match(self._VALID_URL, url, re.VERBOSE)
@@ -1747,14 +1688,8 @@ class YoutubePlaylistIE(InfoExtractor):
videos = []
while True:
- self.report_download_page(playlist_id, page_num)
-
url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
- try:
- page = compat_urllib_request.urlopen(url).read().decode('utf8')
- except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
- self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
- return
+ page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)
try:
response = json.loads(page)
@@ -1765,12 +1700,11 @@ class YoutubePlaylistIE(InfoExtractor):
if 'feed' not in response:
self._downloader.report_error(u'Got a malformed response from YouTube API')
return
+ playlist_title = response['feed']['title']['$t']
if 'entry' not in response['feed']:
# Number of videos is a multiple of self._MAX_RESULTS
break
- playlist_title = response['feed']['title']['$t']
-
videos += [ (entry['yt$position']['$t'], entry['content']['src'])
for entry in response['feed']['entry']
if 'content' in entry ]
@@ -1794,10 +1728,6 @@ class YoutubeChannelIE(InfoExtractor):
_MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
IE_NAME = u'youtube:channel'
- def report_download_page(self, channel_id, pagenum):
- """Report attempt to download channel page with given number."""
- self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))
-
def extract_videos_from_page(self, page):
ids_in_page = []
for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
@@ -1817,14 +1747,9 @@ class YoutubeChannelIE(InfoExtractor):
video_ids = []
pagenum = 1
- self.report_download_page(channel_id, pagenum)
url = self._TEMPLATE_URL % (channel_id, pagenum)
- request = compat_urllib_request.Request(url)
- try:
- page = compat_urllib_request.urlopen(request).read().decode('utf8')
- except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
- self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
- return
+ page = self._download_webpage(url, channel_id,
+ u'Downloading page #%s' % pagenum)
# Extract video identifiers
ids_in_page = self.extract_videos_from_page(page)
@@ -1835,14 +1760,9 @@ class YoutubeChannelIE(InfoExtractor):
while True:
pagenum = pagenum + 1
- self.report_download_page(channel_id, pagenum)
url = self._MORE_PAGES_URL % (pagenum, channel_id)
- request = compat_urllib_request.Request(url)
- try:
- page = compat_urllib_request.urlopen(request).read().decode('utf8')
- except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
- self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
- return
+ page = self._download_webpage(url, channel_id,
+ u'Downloading page #%s' % pagenum)
page = json.loads(page)
@@ -1869,14 +1789,6 @@ class YoutubeUserIE(InfoExtractor):
_VIDEO_INDICATOR = r'/watch\?v=(.+?)[\<&]'
IE_NAME = u'youtube:user'
- def __init__(self, downloader=None):
- InfoExtractor.__init__(self, downloader)
-
- def report_download_page(self, username, start_index):
- """Report attempt to download user page."""
- self._downloader.to_screen(u'[youtube] user %s: Downloading video ids from %d to %d' %
- (username, start_index, start_index + self._GDATA_PAGE_SIZE))
-
def _real_extract(self, url):
# Extract username
mobj = re.match(self._VALID_URL, url)
@@ -1896,15 +1808,10 @@ class YoutubeUserIE(InfoExtractor):
while True:
start_index = pagenum * self._GDATA_PAGE_SIZE + 1
- self.report_download_page(username, start_index)
- request = compat_urllib_request.Request(self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index))
-
- try:
- page = compat_urllib_request.urlopen(request).read().decode('utf-8')
- except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
- self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
- return
+ gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
+ page = self._download_webpage(gdata_url, username,
+ u'Downloading video ids from %d to %d' % (start_index, start_index + self._GDATA_PAGE_SIZE))
# Extract video identifiers
ids_in_page = []
@@ -1938,14 +1845,6 @@ class BlipTVUserIE(InfoExtractor):
_PAGE_SIZE = 12
IE_NAME = u'blip.tv:user'
- def __init__(self, downloader=None):
- InfoExtractor.__init__(self, downloader)
-
- def report_download_page(self, username, pagenum):
- """Report attempt to download user page."""
- self.to_screen(u'user %s: Downloading video ids from page %d' %
- (username, pagenum))
-
def _real_extract(self, url):
# Extract username
mobj = re.match(self._VALID_URL, url)
@@ -1957,15 +1856,9 @@ class BlipTVUserIE(InfoExtractor):
page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
- request = compat_urllib_request.Request(url)
-
- try:
- page = compat_urllib_request.urlopen(request).read().decode('utf-8')
- mobj = re.search(r'data-users-id="([^"]+)"', page)
- page_base = page_base % mobj.group(1)
- except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
- self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
- return
+ page = self._download_webpage(url, username, u'Downloading user page')
+ mobj = re.search(r'data-users-id="([^"]+)"', page)
+ page_base = page_base % mobj.group(1)
# Download video ids using BlipTV Ajax calls. Result size per
@@ -1977,14 +1870,9 @@ class BlipTVUserIE(InfoExtractor):
pagenum = 1
while True:
- self.report_download_page(username, pagenum)
url = page_base + "&page=" + str(pagenum)
- request = compat_urllib_request.Request( url )
- try:
- page = compat_urllib_request.urlopen(request).read().decode('utf-8')
- except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
- self._downloader.report_error(u'unable to download webpage: %s' % str(err))
- return
+ page = self._download_webpage(url, username,
+ u'Downloading video ids from page %d' % pagenum)
# Extract video identifiers
ids_in_page = []
@@ -2016,10 +1904,6 @@ class DepositFilesIE(InfoExtractor):
_VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
- def report_download_webpage(self, file_id):
- """Report webpage download."""
- self.to_screen(u'%s: Downloading webpage' % file_id)
-
def _real_extract(self, url):
file_id = url.split('/')[-1]
# Rebuild url in english locale
@@ -2270,9 +2154,6 @@ class MyVideoIE(InfoExtractor):
_VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
IE_NAME = u'myvideo'
- def __init__(self, downloader=None):
- InfoExtractor.__init__(self, downloader)
-
def _real_extract(self,url):
mobj = re.match(self._VALID_URL, url)
if mobj is None:
@@ -2350,12 +2231,6 @@ class ComedyCentralIE(InfoExtractor):
"""Receives a URL and returns True if suitable for this IE."""
return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
- def report_config_download(self, episode_id, media_id):
- self.to_screen(u'%s: Downloading configuration for %s' % (episode_id, media_id))
-
- def report_index_download(self, episode_id):
- self.to_screen(u'%s: Downloading show index' % episode_id)
-
def _print_formats(self, formats):
print('Available formats:')
for x in formats:
@@ -2389,15 +2264,8 @@ class ComedyCentralIE(InfoExtractor):
else:
epTitle = mobj.group('episode')
- req = compat_urllib_request.Request(url)
self.report_extraction(epTitle)
- try:
- htmlHandle = compat_urllib_request.urlopen(req)
- html = htmlHandle.read()
- webpage = html.decode('utf-8')
- except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
- self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
- return
+ webpage = self._download_webpage(url, epTitle)
if dlNewest:
url = htmlHandle.geturl()
mobj = re.match(self._VALID_URL, url, re.VERBOSE)
@@ -2425,12 +2293,9 @@ class ComedyCentralIE(InfoExtractor):
uri = mMovieParams[0][1]
indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
- self.report_index_download(epTitle)
- try:
- indexXml = compat_urllib_request.urlopen(indexUrl).read()
- except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
- self._downloader.report_error(u'unable to download episode index: ' + compat_str(err))
- return
+ indexXml = self._download_webpage(indexUrl, epTitle,
+ u'Downloading show index',
+ u'unable to download episode index')
results = []
@@ -2441,17 +2306,12 @@ class ComedyCentralIE(InfoExtractor):
shortMediaId = mediaId.split(':')[-1]
showId = mediaId.split(':')[-2].replace('.com', '')
officialTitle = itemEl.findall('./title')[0].text
- officialDate = itemEl.findall('./pubDate')[0].text
+ officialDate = unified_strdate(itemEl.findall('./pubDate')[0].text)
configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
compat_urllib_parse.urlencode({'uri': mediaId}))
- configReq = compat_urllib_request.Request(configUrl)
- self.report_config_download(epTitle, shortMediaId)
- try:
- configXml = compat_urllib_request.urlopen(configReq).read()
- except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
- self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
- return
+ configXml = self._download_webpage(configUrl, epTitle,
+ u'Downloading configuration for %s' % shortMediaId)
cdoc = xml.etree.ElementTree.fromstring(configXml)
turls = []
@@ -2508,9 +2368,6 @@ class EscapistIE(InfoExtractor):
_VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P[^/]+)/(?P[^/?]+)[/?]?.*$'
IE_NAME = u'escapist'
- def report_config_download(self, showName):
- self.to_screen(u'%s: Downloading configuration' % showName)
-
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
if mobj is None:
@@ -2520,14 +2377,7 @@ class EscapistIE(InfoExtractor):
videoId = mobj.group('episode')
self.report_extraction(showName)
- try:
- webPage = compat_urllib_request.urlopen(url)
- webPageBytes = webPage.read()
- m = re.match(r'text/html; charset="?([^"]+)"?', webPage.headers['Content-Type'])
- webPage = webPageBytes.decode(m.group(1) if m else 'utf-8')
- except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
- self._downloader.report_error(u'unable to download webpage: ' + compat_str(err))
- return
+ webPage = self._download_webpage(url, showName)
descMatch = re.search('/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P[^&]+)(&video=(?P