X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Fextractor%2Fcommon.py;h=9e1d62c2b908f0c8ac826a5d16fba10c4645b8b2;hb=8c25f81beea169c9d6540eea1a6f71dc045da6ed;hp=9b36e07891524c627be2b20a1adbd5269d57930a;hpb=7dabd2ac45a53bc608390a72c2d43044aaf6efb8;p=youtube-dl diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 9b36e0789..9e1d62c2b 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -1,4 +1,7 @@ +from __future__ import unicode_literals + import base64 +import datetime import hashlib import json import netrc @@ -9,15 +12,19 @@ import sys import time import xml.etree.ElementTree -from ..utils import ( +from ..compat import ( compat_http_client, compat_urllib_error, compat_urllib_parse_urlparse, + compat_urlparse, compat_str, - +) +from ..utils import ( clean_html, compiled_regex_type, ExtractorError, + float_or_none, + int_or_none, RegexNotFoundError, sanitize_filename, unescapeHTML, @@ -66,6 +73,7 @@ class InfoExtractor(object): * acodec Name of the audio codec in use * asr Audio sampling rate in Hertz * vbr Average video bitrate in KBit/s + * fps Frame rate * vcodec Name of the video codec in use * container Name of the container format * filesize The number of bytes, if known in advance @@ -83,6 +91,16 @@ class InfoExtractor(object): format, irrespective of the file format. -1 for default (order by other properties), -2 or smaller for less than default. + * source_preference Order number for this video source + (quality takes higher priority) + -1 for default (order by other properties), + -2 or smaller for less than default. + * http_referer HTTP Referer header value to set. + * http_method HTTP method to use for the download. + * http_headers A dictionary of additional HTTP headers + to add to the request. + * http_post_data Additional data to send with a POST + request. url: Final video URL. ext: Video filename extension. format: The video format, defaults to ext (used for --get-format) @@ -107,7 +125,7 @@ class InfoExtractor(object): upload_date: Video upload date (YYYYMMDD). If not explicitly set, calculated from timestamp. uploader_id: Nickname or id of the video uploader. - location: Physical location of the video. + location: Physical location where the video was filmed. subtitles: The subtitle file contents as a dictionary in the format {language: subtitles}. duration: Length of the video in seconds, as an integer. @@ -121,9 +139,13 @@ class InfoExtractor(object): by YoutubeDL if it's missing) categories: A list of categories that the video falls in, for example ["Sports", "Berlin"] + is_live: True, False, or None (=unknown). Whether this video is a + live stream that goes on instead of a fixed-length video. Unless mentioned otherwise, the fields should be Unicode strings. + Unless mentioned otherwise, None is equivalent to absence of information. + Subclasses of this one should re-define the _real_initialize() and _real_extract() methods and define a _VALID_URL regexp. Probably, they should also be added to the list of extractors. @@ -152,6 +174,14 @@ class InfoExtractor(object): cls._VALID_URL_RE = re.compile(cls._VALID_URL) return cls._VALID_URL_RE.match(url) is not None + @classmethod + def _match_id(cls, url): + if '_VALID_URL_RE' not in cls.__dict__: + cls._VALID_URL_RE = re.compile(cls._VALID_URL) + m = cls._VALID_URL_RE.match(url) + assert m + return m.group('id') + @classmethod def working(cls): """Getter method for _WORKING.""" @@ -195,17 +225,17 @@ class InfoExtractor(object): self.report_download_webpage(video_id) elif note is not False: if video_id is None: - self.to_screen(u'%s' % (note,)) + self.to_screen('%s' % (note,)) else: - self.to_screen(u'%s: %s' % (video_id, note)) + self.to_screen('%s: %s' % (video_id, note)) try: return self._downloader.urlopen(url_or_request) except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: if errnote is False: return False if errnote is None: - errnote = u'Unable to download webpage' - errmsg = u'%s: %s' % (errnote, compat_str(err)) + errnote = 'Unable to download webpage' + errmsg = '%s: %s' % (errnote, compat_str(err)) if fatal: raise ExtractorError(errmsg, sys.exc_info()[2], cause=err) else: @@ -214,7 +244,6 @@ class InfoExtractor(object): def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True): """ Returns a tuple (page content as string, URL handle) """ - # Strip hashes from the URL (#1038) if isinstance(url_or_request, (compat_str, str)): url_or_request = url_or_request.partition('#')[0] @@ -223,6 +252,10 @@ class InfoExtractor(object): if urlh is False: assert not fatal return False + content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal) + return (content, urlh) + + def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True): content_type = urlh.headers.get('Content-Type', '') webpage_bytes = urlh.read() m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type) @@ -242,7 +275,7 @@ class InfoExtractor(object): url = url_or_request.get_full_url() except AttributeError: url = url_or_request - self.to_screen(u'Dumping request to ' + url) + self.to_screen('Dumping request to ' + url) dump = base64.b64encode(webpage_bytes).decode('ascii') self._downloader.to_screen(dump) if self._downloader.params.get('write_pages', False): @@ -252,11 +285,17 @@ class InfoExtractor(object): url = url_or_request basen = '%s_%s' % (video_id, url) if len(basen) > 240: - h = u'___' + hashlib.md5(basen.encode('utf-8')).hexdigest() + h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest() basen = basen[:240 - len(h)] + h raw_filename = basen + '.dump' filename = sanitize_filename(raw_filename, restricted=True) - self.to_screen(u'Saving request to ' + filename) + self.to_screen('Saving request to ' + filename) + # Working around MAX_PATH limitation on Windows (see + # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx) + if os.name == 'nt': + absfilepath = os.path.abspath(filename) + if len(absfilepath) > 259: + filename = '\\\\?\\' + absfilepath with open(filename, 'wb') as outf: outf.write(webpage_bytes) @@ -265,17 +304,17 @@ class InfoExtractor(object): except LookupError: content = webpage_bytes.decode('utf-8', 'replace') - if (u'Access to this site is blocked' in content and - u'Websense' in content[:512]): - msg = u'Access to this webpage has been blocked by Websense filtering software in your network.' + if ('Access to this site is blocked' in content and + 'Websense' in content[:512]): + msg = 'Access to this webpage has been blocked by Websense filtering software in your network.' blocked_iframe = self._html_search_regex( r'