X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;ds=sidebyside;f=youtube_dl%2FInfoExtractors.py;h=697c031c5119176874558aef23eadb163e45fcdd;hb=f0648fc18c91fde0c2db2e52c1cf78fe6bf3bbc4;hp=9766dea54f94e2b3a38299cf534da6c295d98f3b;hpb=f7b111b7d1b4a7d7d7a8fc99790b3af7d6cff8ad;p=youtube-dl
diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py
index 9766dea54..697c031c5 100755
--- a/youtube_dl/InfoExtractors.py
+++ b/youtube_dl/InfoExtractors.py
@@ -23,7 +23,7 @@ class InfoExtractor(object):
Information extractors are the classes that, given a URL, extract
information about the video (or videos) the URL refers to. This
information includes the real video URL, the video title, author and
- others. The information is stored in a dictionary which is then
+ others. The information is stored in a dictionary which is then
passed to the FileDownloader. The FileDownloader processes this
information possibly downloading the video to the file system, among
other possible outcomes.
@@ -32,7 +32,7 @@ class InfoExtractor(object):
id: Video identifier.
url: Final video URL.
- uploader: Nickname of the video uploader, unescaped.
+ uploader: Full name of the video uploader, unescaped.
upload_date: Video upload date (YYYYMMDD).
title: Video title, unescaped.
ext: Video filename extension.
@@ -42,6 +42,7 @@ class InfoExtractor(object):
format: The video format, defaults to ext (used for --get-format)
thumbnail: Full URL to a video thumbnail image.
description: One-line video description.
+ uploader_id: Nickname or id of the video uploader.
player_url: SWF Player URL (used for rtmpdump).
subtitles: The .srt file contents.
urlhandle: [internal] The urlHandle to be used to download the file,
@@ -159,7 +160,7 @@ class YoutubeIE(InfoExtractor):
'44': '480x854',
'45': '720x1280',
'46': '1080x1920',
- }
+ }
IE_NAME = u'youtube'
def suitable(self, url):
@@ -219,6 +220,34 @@ class YoutubeIE(InfoExtractor):
srt += caption + '\n\n'
return srt
+ def _extract_subtitles(self, video_id):
+ self.report_video_subtitles_download(video_id)
+ request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
+ try:
+ srt_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
+ except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
+ return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
+ srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
+ srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
+ if not srt_lang_list:
+ return (u'WARNING: video has no closed captions', None)
+ if self._downloader.params.get('subtitleslang', False):
+ srt_lang = self._downloader.params.get('subtitleslang')
+ elif 'en' in srt_lang_list:
+ srt_lang = 'en'
+ else:
+ srt_lang = list(srt_lang_list.keys())[0]
+ if not srt_lang in srt_lang_list:
+ return (u'WARNING: no closed captions found in the specified language', None)
+ request = compat_urllib_request.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id))
+ try:
+ srt_xml = compat_urllib_request.urlopen(request).read().decode('utf-8')
+ except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
+ return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
+ if not srt_xml:
+ return (u'WARNING: unable to download video subtitles', None)
+ return (None, self._closed_captions_xml_to_srt(srt_xml))
+
def _print_formats(self, formats):
print('Available formats:')
for x in formats:
@@ -356,10 +385,18 @@ class YoutubeIE(InfoExtractor):
# uploader
if 'author' not in video_info:
- self._downloader.trouble(u'ERROR: unable to extract uploader nickname')
+ self._downloader.trouble(u'ERROR: unable to extract uploader name')
return
video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
+ # uploader_id
+ video_uploader_id = None
+ mobj = re.search(r'', video_webpage)
+ if mobj is not None:
+ video_uploader_id = mobj.group(1)
+ else:
+ self._downloader.trouble(u'WARNING: unable to extract uploader nickname')
+
# title
if 'title' not in video_info:
self._downloader.trouble(u'ERROR: unable to extract video title')
@@ -395,35 +432,9 @@ class YoutubeIE(InfoExtractor):
# closed captions
video_subtitles = None
if self._downloader.params.get('writesubtitles', False):
- try:
- self.report_video_subtitles_download(video_id)
- request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
- try:
- srt_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
- except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
- raise Trouble(u'WARNING: unable to download video subtitles: %s' % compat_str(err))
- srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
- srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
- if not srt_lang_list:
- raise Trouble(u'WARNING: video has no closed captions')
- if self._downloader.params.get('subtitleslang', False):
- srt_lang = self._downloader.params.get('subtitleslang')
- elif 'en' in srt_lang_list:
- srt_lang = 'en'
- else:
- srt_lang = srt_lang_list.keys()[0]
- if not srt_lang in srt_lang_list:
- raise Trouble(u'WARNING: no closed captions found in the specified language')
- request = compat_urllib_request.Request('http://www.youtube.com/api/timedtext?lang=%s&name=%s&v=%s' % (srt_lang, srt_lang_list[srt_lang], video_id))
- try:
- srt_xml = compat_urllib_request.urlopen(request).read().decode('utf-8')
- except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
- raise Trouble(u'WARNING: unable to download video subtitles: %s' % compat_str(err))
- if not srt_xml:
- raise Trouble(u'WARNING: unable to download video subtitles')
- video_subtitles = self._closed_captions_xml_to_srt(srt_xml)
- except Trouble as trouble:
- self._downloader.trouble(str(trouble))
+ (srt_error, video_subtitles) = self._extract_subtitles(video_id)
+ if srt_error:
+ self._downloader.trouble(srt_error)
if 'length_seconds' not in video_info:
self._downloader.trouble(u'WARNING: unable to extract video duration')
@@ -443,7 +454,7 @@ class YoutubeIE(InfoExtractor):
elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
url_data_strs = video_info['url_encoded_fmt_stream_map'][0].split(',')
url_data = [compat_parse_qs(uds) for uds in url_data_strs]
- url_data = filter(lambda ud: 'itag' in ud and 'url' in ud, url_data)
+ url_data = [ud for ud in url_data if 'itag' in ud and 'url' in ud]
url_map = dict((ud['itag'][0], ud['url'][0] + '&signature=' + ud['sig'][0]) for ud in url_data)
format_limit = self._downloader.params.get('format_limit', None)
@@ -493,6 +504,7 @@ class YoutubeIE(InfoExtractor):
'id': video_id,
'url': video_real_url,
'uploader': video_uploader,
+ 'uploader_id': video_uploader_id,
'upload_date': upload_date,
'title': video_title,
'ext': video_extension,
@@ -799,6 +811,7 @@ class PhotobucketIE(InfoExtractor):
class YahooIE(InfoExtractor):
"""Information extractor for video.yahoo.com."""
+ _WORKING = False
# _VALID_URL matches all Yahoo! Video URLs
# _VPAGE_URL matches only the extractable '/watch/' URLs
_VALID_URL = r'(?:http://)?(?:[a-z]+\.)?video\.yahoo\.com/(?:watch|network)/([0-9]+)(?:/|\?v=)([0-9]+)(?:[#\?].*)?'
@@ -987,26 +1000,27 @@ class VimeoIE(InfoExtractor):
except:
self._downloader.trouble(u'ERROR: unable to extract info section')
return
-
+
# Extract title
video_title = config["video"]["title"]
- # Extract uploader
+ # Extract uploader and uploader_id
video_uploader = config["video"]["owner"]["name"]
+ video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]
# Extract video thumbnail
video_thumbnail = config["video"]["thumbnail"]
# Extract video description
- video_description = get_element_by_id("description", webpage)
+ video_description = get_element_by_attribute("itemprop", "description", webpage)
if video_description: video_description = clean_html(video_description)
else: video_description = ''
# Extract upload date
video_upload_date = None
- mobj = re.search(r'[^:]*: (.*?)( \([^\(]*\))?', webpage)
+ mobj = re.search(r' 0:
+ if len(list(url_map.keys())) > 0:
# Decide which formats to download
req_format = self._downloader.params.get('format', None)
format_limit = self._downloader.params.get('format_limit', None)
@@ -2253,7 +2270,7 @@ class MyVideoIE(InfoExtractor):
def __init__(self, downloader=None):
InfoExtractor.__init__(self, downloader)
-
+
def report_download_webpage(self, video_id):
"""Report webpage download."""
self._downloader.to_screen(u'[myvideo] %s: Downloading webpage' % video_id)
@@ -2274,7 +2291,7 @@ class MyVideoIE(InfoExtractor):
request = compat_urllib_request.Request('http://www.myvideo.de/watch/%s' % video_id)
try:
self.report_download_webpage(video_id)
- webpage = compat_urllib_request.urlopen(request).read()
+ webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
return
@@ -2307,10 +2324,10 @@ class ComedyCentralIE(InfoExtractor):
"""Information extractor for The Daily Show and Colbert Report """
# urls can be abbreviations like :thedailyshow or :colbert
- # urls for episodes like:
+ # urls for episodes like:
# or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
# or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
- # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
+ # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
_VALID_URL = r"""^(:(?Ptds|thedailyshow|cr|colbert|colbertnation|colbertreport)
|(https?://)?(www\.)?
(?Pthedailyshow|colbertnation)\.com/
@@ -2318,7 +2335,7 @@ class ComedyCentralIE(InfoExtractor):
(?P
(the-colbert-report-(videos|collections)/(?P[0-9]+)/[^/]*/(?P.*?))
|(watch/(?P[^/]*)/(?P.*)))))
- $"""
+ $"""
IE_NAME = u'comedycentral'
_available_formats = ['3500', '2200', '1700', '1200', '750', '400']
@@ -2422,7 +2439,7 @@ class ComedyCentralIE(InfoExtractor):
return
else:
mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
-
+
playerUrl_raw = mMovieParams[0][0]
self.report_player_url(epTitle)
try:
@@ -2471,7 +2488,7 @@ class ComedyCentralIE(InfoExtractor):
if len(turls) == 0:
self._downloader.trouble(u'\nERROR: unable to download ' + mediaId + ': No videos found')
continue
-
+
if self._downloader.params.get('listformats', None):
self._print_formats([i[0] for i in turls])
return
@@ -2511,7 +2528,7 @@ class ComedyCentralIE(InfoExtractor):
}
results.append(info)
-
+
return results
@@ -2556,7 +2573,9 @@ class EscapistIE(InfoExtractor):
self.report_config_download(showName)
try:
- configJSON = compat_urllib_request.urlopen(configUrl).read()
+ configJSON = compat_urllib_request.urlopen(configUrl)
+ m = re.match(r'text/html; charset="?([^"]+)"?', configJSON.headers['Content-Type'])
+ configJSON = configJSON.read().decode(m.group(1) if m else 'utf-8')
except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
self._downloader.trouble(u'ERROR: unable to download configuration: ' + compat_str(err))
return
@@ -2876,6 +2895,8 @@ class InfoQIE(InfoExtractor):
class MixcloudIE(InfoExtractor):
"""Information extractor for www.mixcloud.com"""
+
+ _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
_VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
IE_NAME = u'mixcloud'
@@ -2966,7 +2987,7 @@ class MixcloudIE(InfoExtractor):
if file_url is not None:
break # got it!
else:
- if req_format not in formats.keys():
+ if req_format not in list(formats.keys()):
self._downloader.trouble(u'ERROR: format is not available')
return
@@ -3071,7 +3092,7 @@ class StanfordOpenClassroomIE(InfoExtractor):
assert entry['type'] == 'reference'
results += self.extract(entry['url'])
return results
-
+
else: # Root page
info = {
'id': 'Stanford OpenClassroom',
@@ -3145,7 +3166,7 @@ class MTVIE(InfoExtractor):
self._downloader.trouble(u'ERROR: unable to extract performer')
return
performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
- video_title = performer + ' - ' + song_name
+ video_title = performer + ' - ' + song_name
mobj = re.search(r'', webpage)
if mobj is None:
@@ -3265,7 +3286,7 @@ class YoukuIE(InfoExtractor):
seed = config['data'][0]['seed']
format = self._downloader.params.get('format', None)
- supported_format = config['data'][0]['streamfileids'].keys()
+ supported_format = list(config['data'][0]['streamfileids'].keys())
if format is None or format == 'best':
if 'hd2' in supported_format:
@@ -3378,7 +3399,7 @@ class XNXXIE(InfoExtractor):
class GooglePlusIE(InfoExtractor):
"""Information extractor for plus.google.com."""
- _VALID_URL = r'(?:https://)?plus\.google\.com/(?:\w+/)*?(\d+)/posts/(\w+)'
+ _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
IE_NAME = u'plus.google'
def __init__(self, downloader=None):
@@ -3386,7 +3407,7 @@ class GooglePlusIE(InfoExtractor):
def report_extract_entry(self, url):
"""Report downloading extry"""
- self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url.decode('utf-8'))
+ self._downloader.to_screen(u'[plus.google] Downloading entry: %s' % url)
def report_date(self, upload_date):
"""Report downloading extry"""
@@ -3394,15 +3415,15 @@ class GooglePlusIE(InfoExtractor):
def report_uploader(self, uploader):
"""Report downloading extry"""
- self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader.decode('utf-8'))
+ self._downloader.to_screen(u'[plus.google] Uploader: %s' % uploader)
def report_title(self, video_title):
"""Report downloading extry"""
- self._downloader.to_screen(u'[plus.google] Title: %s' % video_title.decode('utf-8'))
+ self._downloader.to_screen(u'[plus.google] Title: %s' % video_title)
def report_extract_vid_page(self, video_page):
"""Report information extraction."""
- self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page.decode('utf-8'))
+ self._downloader.to_screen(u'[plus.google] Extracting video page: %s' % video_page)
def _real_extract(self, url):
# Extract id from URL
@@ -3412,7 +3433,7 @@ class GooglePlusIE(InfoExtractor):
return
post_url = mobj.group(0)
- video_id = mobj.group(2)
+ video_id = mobj.group(1)
video_extension = 'flv'
@@ -3420,7 +3441,7 @@ class GooglePlusIE(InfoExtractor):
self.report_extract_entry(post_url)
request = compat_urllib_request.Request(post_url)
try:
- webpage = compat_urllib_request.urlopen(request).read()
+ webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
self._downloader.trouble(u'ERROR: Unable to retrieve entry webpage: %s' % compat_str(err))
return
@@ -3462,7 +3483,7 @@ class GooglePlusIE(InfoExtractor):
video_page = mobj.group(1)
request = compat_urllib_request.Request(video_page)
try:
- webpage = compat_urllib_request.urlopen(request).read()
+ webpage = compat_urllib_request.urlopen(request).read().decode('utf-8')
except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
self._downloader.trouble(u'ERROR: Unable to retrieve video webpage: %s' % compat_str(err))
return
@@ -3484,16 +3505,19 @@ class GooglePlusIE(InfoExtractor):
# Only get the url. The resolution part in the tuple has no use anymore
video_url = video_url[-1]
# Treat escaped \u0026 style hex
- video_url = unicode(video_url, "unicode_escape")
+ try:
+ video_url = video_url.decode("unicode_escape")
+ except AttributeError: # Python 3
+ video_url = bytes(video_url, 'ascii').decode('unicode-escape')
return [{
- 'id': video_id.decode('utf-8'),
+ 'id': video_id,
'url': video_url,
- 'uploader': uploader.decode('utf-8'),
- 'upload_date': upload_date.decode('utf-8'),
- 'title': video_title.decode('utf-8'),
- 'ext': video_extension.decode('utf-8'),
+ 'uploader': uploader,
+ 'upload_date': upload_date,
+ 'title': video_title,
+ 'ext': video_extension,
}]
class NBAIE(InfoExtractor):
@@ -3571,7 +3595,7 @@ class JustinTVIE(InfoExtractor):
except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
self._downloader.trouble(u'ERROR: unable to download video info JSON: %s' % compat_str(err))
return
-
+
response = json.loads(webpage)
info = []
for clip in response:
@@ -3594,7 +3618,7 @@ class JustinTVIE(InfoExtractor):
if mobj is None:
self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
return
-
+
api = 'http://api.justin.tv'
video_id = mobj.group(mobj.lastindex)
paged = False
@@ -3604,9 +3628,9 @@ class JustinTVIE(InfoExtractor):
else:
api += '/clip/show/%s.json'
api = api % (video_id,)
-
+
self.report_extraction(video_id)
-
+
info = []
offset = 0
limit = self._JUSTIN_PAGE_LIMIT
@@ -3620,3 +3644,52 @@ class JustinTVIE(InfoExtractor):
break
offset += limit
return info
+
+class FunnyOrDieIE(InfoExtractor):
+ _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
+ IE_NAME = u'FunnyOrDie'
+
+ def report_extraction(self, video_id):
+ self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ if mobj is None:
+ self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
+ return
+
+ video_id = mobj.group('id')
+ self.report_extraction(video_id)
+ try:
+ urlh = compat_urllib_request.urlopen(url)
+ webpage_bytes = urlh.read()
+ webpage = webpage_bytes.decode('utf-8', 'ignore')
+ except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
+ self._downloader.trouble(u'ERROR: unable to download webpage: %s' % compat_str(err))
+ return
+
+ m = re.search(r'