X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Fextractor%2Fcommon.py;h=52c00186e458fd131e28eb178ea09df8e9d9ce0d;hb=ff21a8e0ee43d4ce0b75cd938f9bdfab664dd579;hp=78f238f8428c5df0fce2dcc26f66b1301595e62b;hpb=38d63d846edbf4c897a7df6e2ee545002e1203ca;p=youtube-dl diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 78f238f84..cd155a090 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -1,22 +1,32 @@ +from __future__ import unicode_literals + import base64 +import datetime import hashlib import json +import netrc import os import re import socket import sys -import netrc +import time import xml.etree.ElementTree -from ..utils import ( +from ..compat import ( + compat_cookiejar, compat_http_client, compat_urllib_error, compat_urllib_parse_urlparse, + compat_urlparse, compat_str, - +) +from ..utils import ( + age_restricted, clean_html, compiled_regex_type, ExtractorError, + float_or_none, + int_or_none, RegexNotFoundError, sanitize_filename, unescapeHTML, @@ -31,11 +41,15 @@ class InfoExtractor(object): information about the video (or videos) the URL refers to. This information includes the real video URL, the video title, author and others. The information is stored in a dictionary which is then - passed to the FileDownloader. The FileDownloader processes this + passed to the YoutubeDL. The YoutubeDL processes this information possibly downloading the video to the file system, among other possible outcomes. - The dictionaries must include the following fields: + The type field determines the the type of the result. + By far the most common value (and the default if _type is missing) is + "video", which indicates a single video. + + For a video, the dictionaries must include the following fields: id: Video identifier. title: Video title, unescaped. @@ -65,9 +79,11 @@ class InfoExtractor(object): * acodec Name of the audio codec in use * asr Audio sampling rate in Hertz * vbr Average video bitrate in KBit/s + * fps Frame rate * vcodec Name of the video codec in use * container Name of the container format * filesize The number of bytes, if known in advance + * filesize_approx An estimate for the number of bytes * player_url SWF Player URL (used for rtmpdump). * protocol The protocol that will be used for the actual download, lower-case. @@ -77,10 +93,27 @@ class InfoExtractor(object): by this field, regardless of all other values. -1 for default (order by other properties), -2 or smaller for less than default. + < -1000 to hide the format (if there is + another one which is strictly better) + * language_preference Is this in the correct requested + language? + 10 if it's what the URL is about, + -1 for default (don't know), + -10 otherwise, other values reserved for now. * quality Order number of the video quality of this format, irrespective of the file format. -1 for default (order by other properties), -2 or smaller for less than default. + * source_preference Order number for this video source + (quality takes higher priority) + -1 for default (order by other properties), + -2 or smaller for less than default. + * http_referer HTTP Referer header value to set. + * http_method HTTP method to use for the download. + * http_headers A dictionary of additional HTTP headers + to add to the request. + * http_post_data Additional data to send with a POST + request. url: Final video URL. ext: Video filename extension. format: The video format, defaults to ext (used for --get-format) @@ -88,20 +121,25 @@ class InfoExtractor(object): The following fields are optional: + alt_title: A secondary title of the video. display_id An alternative identifier for the video, not necessarily unique, but available before title. Typically, id is something like "4234987", title "Dancing naked mole rats", and display_id "dancing-naked-mole-rats" - thumbnails: A list of dictionaries (with the entries "resolution" and - "url") for the varying thumbnails + thumbnails: A list of dictionaries, with the following entries: + * "url" + * "width" (optional, int) + * "height" (optional, int) + * "resolution" (optional, string "{width}x{height"}, + deprecated) thumbnail: Full URL to a video thumbnail image. - description: One-line video description. + description: Full video description. uploader: Full name of the video uploader. timestamp: UNIX timestamp of the moment the video became available. upload_date: Video upload date (YYYYMMDD). If not explicitly set, calculated from timestamp. uploader_id: Nickname or id of the video uploader. - location: Physical location of the video. + location: Physical location where the video was filmed. subtitles: The subtitle file contents as a dictionary in the format {language: subtitles}. duration: Length of the video in seconds, as an integer. @@ -109,13 +147,63 @@ class InfoExtractor(object): like_count: Number of positive ratings of the video dislike_count: Number of negative ratings of the video comment_count: Number of comments on the video + comments: A list of comments, each with one or more of the following + properties (all but one of text or html optional): + * "author" - human-readable name of the comment author + * "author_id" - user ID of the comment author + * "id" - Comment ID + * "html" - Comment as HTML + * "text" - Plain text of the comment + * "timestamp" - UNIX timestamp of comment + * "parent" - ID of the comment this one is replying to. + Set to "root" to indicate that this is a + comment to the original video. age_limit: Age restriction for the video, as an integer (years) webpage_url: The url to the video webpage, if given to youtube-dl it should allow to get the same result again. (It will be set by YoutubeDL if it's missing) + categories: A list of categories that the video falls in, for example + ["Sports", "Berlin"] + is_live: True, False, or None (=unknown). Whether this video is a + live stream that goes on instead of a fixed-length video. Unless mentioned otherwise, the fields should be Unicode strings. + Unless mentioned otherwise, None is equivalent to absence of information. + + + _type "playlist" indicates multiple videos. + There must be a key "entries", which is a list, an iterable, or a PagedList + object, each element of which is a valid dictionary by this specification. + + Additionally, playlists can have "title" and "id" attributes with the same + semantics as videos (see above). + + + _type "multi_video" indicates that there are multiple videos that + form a single show, for examples multiple acts of an opera or TV episode. + It must have an entries key like a playlist and contain all the keys + required for a video at the same time. + + + _type "url" indicates that the video must be extracted from another + location, possibly by a different extractor. Its only required key is: + "url" - the next URL to extract. + The key "ie_key" can be set to the class name (minus the trailing "IE", + e.g. "Youtube") if the extractor class is known in advance. + Additionally, the dictionary may have any properties of the resolved entity + known in advance, for example "title" if the title of the referred video is + known ahead of time. + + + _type "url_transparent" entities have the same specification as "url", but + indicate that the given additional information is more precise than the one + associated with the resolved URL. + This is useful when a site employs a video service that hosts the video and + its technical metadata, but that video service does not embed a useful + title, description etc. + + Subclasses of this one should re-define the _real_initialize() and _real_extract() methods and define a _VALID_URL regexp. Probably, they should also be added to the list of extractors. @@ -144,6 +232,14 @@ class InfoExtractor(object): cls._VALID_URL_RE = re.compile(cls._VALID_URL) return cls._VALID_URL_RE.match(url) is not None + @classmethod + def _match_id(cls, url): + if '_VALID_URL_RE' not in cls.__dict__: + cls._VALID_URL_RE = re.compile(cls._VALID_URL) + m = cls._VALID_URL_RE.match(url) + assert m + return m.group('id') + @classmethod def working(cls): """Getter method for _WORKING.""" @@ -187,17 +283,17 @@ class InfoExtractor(object): self.report_download_webpage(video_id) elif note is not False: if video_id is None: - self.to_screen(u'%s' % (note,)) + self.to_screen('%s' % (note,)) else: - self.to_screen(u'%s: %s' % (video_id, note)) + self.to_screen('%s: %s' % (video_id, note)) try: return self._downloader.urlopen(url_or_request) except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: if errnote is False: return False if errnote is None: - errnote = u'Unable to download webpage' - errmsg = u'%s: %s' % (errnote, compat_str(err)) + errnote = 'Unable to download webpage' + errmsg = '%s: %s' % (errnote, compat_str(err)) if fatal: raise ExtractorError(errmsg, sys.exc_info()[2], cause=err) else: @@ -206,7 +302,6 @@ class InfoExtractor(object): def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True): """ Returns a tuple (page content as string, URL handle) """ - # Strip hashes from the URL (#1038) if isinstance(url_or_request, (compat_str, str)): url_or_request = url_or_request.partition('#')[0] @@ -215,8 +310,14 @@ class InfoExtractor(object): if urlh is False: assert not fatal return False + content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal) + return (content, urlh) + + def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None): content_type = urlh.headers.get('Content-Type', '') webpage_bytes = urlh.read() + if prefix is not None: + webpage_bytes = prefix + webpage_bytes m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type) if m: encoding = m.group(1) @@ -234,7 +335,7 @@ class InfoExtractor(object): url = url_or_request.get_full_url() except AttributeError: url = url_or_request - self.to_screen(u'Dumping request to ' + url) + self.to_screen('Dumping request to ' + url) dump = base64.b64encode(webpage_bytes).decode('ascii') self._downloader.to_screen(dump) if self._downloader.params.get('write_pages', False): @@ -242,21 +343,52 @@ class InfoExtractor(object): url = url_or_request.get_full_url() except AttributeError: url = url_or_request - if len(url) > 200: - h = u'___' + hashlib.md5(url.encode('utf-8')).hexdigest() - url = url[:200 - len(h)] + h - raw_filename = ('%s_%s.dump' % (video_id, url)) + basen = '%s_%s' % (video_id, url) + if len(basen) > 240: + h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest() + basen = basen[:240 - len(h)] + h + raw_filename = basen + '.dump' filename = sanitize_filename(raw_filename, restricted=True) - self.to_screen(u'Saving request to ' + filename) + self.to_screen('Saving request to ' + filename) + # Working around MAX_PATH limitation on Windows (see + # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx) + if os.name == 'nt': + absfilepath = os.path.abspath(filename) + if len(absfilepath) > 259: + filename = '\\\\?\\' + absfilepath with open(filename, 'wb') as outf: outf.write(webpage_bytes) - content = webpage_bytes.decode(encoding, 'replace') - return (content, urlh) - - def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True): + try: + content = webpage_bytes.decode(encoding, 'replace') + except LookupError: + content = webpage_bytes.decode('utf-8', 'replace') + + if ('Access to this site is blocked' in content and + 'Websense' in content[:512]): + msg = 'Access to this webpage has been blocked by Websense filtering software in your network.' + blocked_iframe = self._html_search_regex( + r'