X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2FInfoExtractors.py;h=81eaddc7260333e70a01be3ea354edfc7513f339;hb=14294236bf9623fac4ad748389689b77d52b3547;hp=64079d1fd4e13af4c64379f00a72bab426839198;hpb=7f9d41a55edb8e939c1bb69f0c3ad29380cad478;p=youtube-dl
diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py
index 64079d1fd..81eaddc72 100755
--- a/youtube_dl/InfoExtractors.py
+++ b/youtube_dl/InfoExtractors.py
@@ -48,7 +48,7 @@ class InfoExtractor(object):
uploader_id: Nickname or id of the video uploader.
location: Physical location of the video.
player_url: SWF Player URL (used for rtmpdump).
- subtitles: The .srt file contents.
+ subtitles: The subtitle file contents.
urlhandle: [internal] The urlHandle to be used to download the file,
like returned by urllib.request.urlopen
@@ -126,8 +126,36 @@ class InfoExtractor(object):
def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
""" Returns the data of the page as a string """
urlh = self._request_webpage(url_or_request, video_id, note, errnote)
+ content_type = urlh.headers.get('Content-Type', '')
+ m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
+ if m:
+ encoding = m.group(1)
+ else:
+ encoding = 'utf-8'
webpage_bytes = urlh.read()
- return webpage_bytes.decode('utf-8', 'replace')
+ return webpage_bytes.decode(encoding, 'replace')
+
+ # Helper methods for #608.
+ # Each one sets the correct value of the '_type' key on the result it returns.
+ def video_result(self, video_info):
+ """Returns a video"""
+ video_info['_type'] = 'video'
+ return video_info
+ def url_result(self, url, ie=None):
+ """Returns a url that points to a page that should be processed"""
+ # TODO: 'ie' is currently unused; it should be the class used for getting the info
+ video_info = {'_type': 'url',
+ 'url': url}
+ return video_info
+ def playlist_result(self, entries, playlist_id=None, playlist_title=None):
+ """Returns a playlist"""
+ video_info = {'_type': 'playlist',
+ 'entries': entries}
+ if playlist_id:
+ video_info['id'] = playlist_id
+ if playlist_title:
+ video_info['title'] = playlist_title
+ return video_info
class YoutubeIE(InfoExtractor):
@@ -218,7 +246,16 @@ class YoutubeIE(InfoExtractor):
def report_video_subtitles_download(self, video_id):
"""Report attempt to download video info webpage."""
- self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles' % video_id)
+ self._downloader.to_screen(u'[youtube] %s: Checking available subtitles' % video_id)
+
+ def report_video_subtitles_request(self, video_id, sub_lang, format):
+ """Report attempt to download video subtitles for a language and format."""
+ self._downloader.to_screen(u'[youtube] %s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))
+
+ def report_video_subtitles_available(self, video_id, sub_lang_list):
+ """Report available subtitles."""
+ sub_lang = ",".join(list(sub_lang_list.keys()))
+ self._downloader.to_screen(u'[youtube] %s: Available subtitles for video: %s' % (video_id, sub_lang))
def report_information_extraction(self, video_id):
"""Report attempt to extract video information."""
@@ -232,55 +269,75 @@ class YoutubeIE(InfoExtractor):
"""Indicate the download will use the RTMP protocol."""
self._downloader.to_screen(u'[youtube] RTMP download detected')
- def _closed_captions_xml_to_srt(self, xml_string):
- srt = ''
- texts = re.findall(r'([^<]+)', xml_string, re.MULTILINE)
- # TODO parse xml instead of regex
- for n, (start, dur_tag, dur, caption) in enumerate(texts):
- if not dur: dur = '4'
- start = float(start)
- end = start + float(dur)
- start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
- end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
- caption = unescapeHTML(caption)
- caption = unescapeHTML(caption) # double cycle, intentional
- srt += str(n+1) + '\n'
- srt += start + ' --> ' + end + '\n'
- srt += caption + '\n\n'
- return srt
-
- def _extract_subtitles(self, video_id):
+ def _get_available_subtitles(self, video_id):
self.report_video_subtitles_download(video_id)
request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
try:
- srt_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
+ sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
- return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
- srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
- srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
- if not srt_lang_list:
- return (u'WARNING: video has no closed captions', None)
- if self._downloader.params.get('subtitleslang', False):
- srt_lang = self._downloader.params.get('subtitleslang')
- elif 'en' in srt_lang_list:
- srt_lang = 'en'
- else:
- srt_lang = list(srt_lang_list.keys())[0]
- if not srt_lang in srt_lang_list:
- return (u'WARNING: no closed captions found in the specified language', None)
+ return (u'unable to download video subtitles: %s' % compat_str(err), None)
+ sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
+ sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
+ if not sub_lang_list:
+ return (u'video doesn\'t have subtitles', None)
+ return sub_lang_list
+
+ def _list_available_subtitles(self, video_id):
+ sub_lang_list = self._get_available_subtitles(video_id)
+ self.report_video_subtitles_available(video_id, sub_lang_list)
+
+ def _request_subtitle(self, sub_lang, sub_name, video_id, format):
+ """
+ Return tuple:
+ (error_message, sub_lang, sub)
+ """
+ self.report_video_subtitles_request(video_id, sub_lang, format)
params = compat_urllib_parse.urlencode({
- 'lang': srt_lang,
- 'name': srt_lang_list[srt_lang].encode('utf-8'),
+ 'lang': sub_lang,
+ 'name': sub_name,
'v': video_id,
+ 'fmt': format,
})
url = 'http://www.youtube.com/api/timedtext?' + params
try:
- srt_xml = compat_urllib_request.urlopen(url).read().decode('utf-8')
+ sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
- return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
- if not srt_xml:
- return (u'WARNING: Did not fetch video subtitles', None)
- return (None, self._closed_captions_xml_to_srt(srt_xml))
+ return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
+ if not sub:
+ return (u'Did not fetch video subtitles', None, None)
+ return (None, sub_lang, sub)
+
+ def _extract_subtitle(self, video_id):
+ """
+ Return a list with a tuple:
+ [(error_message, sub_lang, sub)]
+ """
+ sub_lang_list = self._get_available_subtitles(video_id)
+ sub_format = self._downloader.params.get('subtitlesformat')
+ if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
+ return [(sub_lang_list[0], None, None)]
+ if self._downloader.params.get('subtitleslang', False):
+ sub_lang = self._downloader.params.get('subtitleslang')
+ elif 'en' in sub_lang_list:
+ sub_lang = 'en'
+ else:
+ sub_lang = list(sub_lang_list.keys())[0]
+ if not sub_lang in sub_lang_list:
+ return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]
+
+ subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
+ return [subtitle]
+
+ def _extract_all_subtitles(self, video_id):
+ sub_lang_list = self._get_available_subtitles(video_id)
+ sub_format = self._downloader.params.get('subtitlesformat')
+ if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
+ return [(sub_lang_list[0], None, None)]
+ subtitles = []
+ for sub_lang in sub_lang_list:
+ subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
+ subtitles.append(subtitle)
+ return subtitles
def _print_formats(self, formats):
print('Available formats:')
@@ -308,7 +365,7 @@ class YoutubeIE(InfoExtractor):
else:
raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
except (IOError, netrc.NetrcParseError) as err:
- self._downloader.to_stderr(u'WARNING: parsing .netrc: %s' % compat_str(err))
+ self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
return
# Set language
@@ -317,7 +374,7 @@ class YoutubeIE(InfoExtractor):
self.report_lang()
compat_urllib_request.urlopen(request).read()
except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
- self._downloader.to_stderr(u'WARNING: unable to set language: %s' % compat_str(err))
+ self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
return
# No authentication to be performed
@@ -328,7 +385,7 @@ class YoutubeIE(InfoExtractor):
try:
login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
- self._downloader.to_stderr(u'WARNING: unable to fetch login page: %s' % compat_str(err))
+ self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
return
galx = None
@@ -372,10 +429,10 @@ class YoutubeIE(InfoExtractor):
self.report_login()
login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
if re.search(r'(?i)