X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2FInfoExtractors.py;h=e3998fbe88173ae1cb949d2f6256fa0fc06fb530;hb=2e5d60b7db7020b726cd54ee4cad8f2afbd1479d;hp=b1ede697a7bedd5260072450f57a8939988c8e28;hpb=8f6f40d99180ab00c918a79641a1e5508e90c76a;p=youtube-dl
diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py
old mode 100644
new mode 100755
index b1ede697a..e3998fbe8
--- a/youtube_dl/InfoExtractors.py
+++ b/youtube_dl/InfoExtractors.py
@@ -3,7 +3,9 @@
from __future__ import absolute_import
+import base64
import datetime
+import itertools
import netrc
import os
import re
@@ -23,7 +25,7 @@ class InfoExtractor(object):
Information extractors are the classes that, given a URL, extract
information about the video (or videos) the URL refers to. This
information includes the real video URL, the video title, author and
- others. The information is stored in a dictionary which is then
+ others. The information is stored in a dictionary which is then
passed to the FileDownloader. The FileDownloader processes this
information possibly downloading the video to the file system, among
other possible outcomes.
@@ -32,8 +34,6 @@ class InfoExtractor(object):
id: Video identifier.
url: Final video URL.
- uploader: Nickname of the video uploader, unescaped.
- upload_date: Video upload date (YYYYMMDD).
title: Video title, unescaped.
ext: Video filename extension.
@@ -42,6 +42,10 @@ class InfoExtractor(object):
format: The video format, defaults to ext (used for --get-format)
thumbnail: Full URL to a video thumbnail image.
description: One-line video description.
+ uploader: Full name of the video uploader.
+ upload_date: Video upload date (YYYYMMDD).
+ uploader_id: Nickname or id of the video uploader.
+ location: Physical location of the video.
player_url: SWF Player URL (used for rtmpdump).
subtitles: The .srt file contents.
urlhandle: [internal] The urlHandle to be used to download the file,
@@ -100,6 +104,28 @@ class InfoExtractor(object):
"""Real extraction process. Redefine in subclasses."""
pass
+ @property
+ def IE_NAME(self):
+ return type(self).__name__[:-2]
+
+ def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
+ """ Returns the response handle """
+ if note is None:
+ note = u'Downloading video webpage'
+ self._downloader.to_screen(u'[%s] %s: %s' % (self.IE_NAME, video_id, note))
+ try:
+ return compat_urllib_request.urlopen(url_or_request)
+ except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
+ if errnote is None:
+ errnote = u'Unable to download webpage'
+ raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2])
+
+ def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
+ """ Returns the data of the page as a string """
+ urlh = self._request_webpage(url_or_request, video_id, note, errnote)
+ webpage_bytes = urlh.read()
+ return webpage_bytes.decode('utf-8', 'replace')
+
class YoutubeIE(InfoExtractor):
"""Information extractor for youtube.com."""
@@ -116,7 +142,7 @@ class YoutubeIE(InfoExtractor):
|(?: # or the v= param in all its forms
(?:watch(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
(?:\?|\#!?) # the params delimiter ? or # or #!
- (?:.+&)? # any other preceding param (like /?s=tuff&v=xxxx)
+ (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
v=
)
)? # optional -> youtube.com/xxxx is OK
@@ -125,7 +151,7 @@ class YoutubeIE(InfoExtractor):
(?(1).+)? # if we found the ID, everything can follow
$"""
_LANG_URL = r'http://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
- _LOGIN_URL = 'https://www.youtube.com/signup?next=/&gl=US&hl=en'
+ _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
_AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
_NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
_NETRC_MACHINE = 'youtube'
@@ -159,7 +185,7 @@ class YoutubeIE(InfoExtractor):
'44': '480x854',
'45': '720x1280',
'46': '1080x1920',
- }
+ }
IE_NAME = u'youtube'
def suitable(self, url):
@@ -202,22 +228,39 @@ class YoutubeIE(InfoExtractor):
"""Indicate the download will use the RTMP protocol."""
self._downloader.to_screen(u'[youtube] RTMP download detected')
- def _closed_captions_xml_to_srt(self, xml_string):
- srt = ''
- texts = re.findall(r'([^<]+)', xml_string, re.MULTILINE)
- # TODO parse xml instead of regex
- for n, (start, dur_tag, dur, caption) in enumerate(texts):
- if not dur: dur = '4'
- start = float(start)
- end = start + float(dur)
- start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
- end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
- caption = unescapeHTML(caption)
- caption = unescapeHTML(caption) # double cycle, intentional
- srt += str(n+1) + '\n'
- srt += start + ' --> ' + end + '\n'
- srt += caption + '\n\n'
- return srt
+ def _extract_subtitles(self, video_id):
+ self.report_video_subtitles_download(video_id)
+ request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
+ try:
+ srt_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
+ except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
+ return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
+ srt_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', srt_list)
+ srt_lang_list = dict((l[1], l[0]) for l in srt_lang_list)
+ if not srt_lang_list:
+ return (u'WARNING: video has no closed captions', None)
+ if self._downloader.params.get('subtitleslang', False):
+ srt_lang = self._downloader.params.get('subtitleslang')
+ elif 'en' in srt_lang_list:
+ srt_lang = 'en'
+ else:
+ srt_lang = list(srt_lang_list.keys())[0]
+ if not srt_lang in srt_lang_list:
+ return (u'WARNING: no closed captions found in the specified language', None)
+ params = compat_urllib_parse.urlencode({
+ 'lang': srt_lang,
+ 'name': srt_lang_list[srt_lang].encode('utf-8'),
+ 'v': video_id,
+ 'fmt': 'srt',
+ })
+ url = 'http://www.youtube.com/api/timedtext?' + params
+ try:
+ srt = compat_urllib_request.urlopen(url).read().decode('utf-8')
+ except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
+ return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
+ if not srt:
+ return (u'WARNING: Did not fetch video subtitles', None)
+ return (None, srt)
def _print_formats(self, formats):
print('Available formats:')
@@ -261,19 +304,54 @@ class YoutubeIE(InfoExtractor):
if username is None:
return
+ request = compat_urllib_request.Request(self._LOGIN_URL)
+ try:
+ login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
+ except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
+ self._downloader.to_stderr(u'WARNING: unable to fetch login page: %s' % compat_str(err))
+ return
+
+ galx = None
+ dsh = None
+ match = re.search(re.compile(r']* name="loginForm"', login_results) is not None:
+ login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
+ if re.search(r'(?i)