X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Fextractor%2Fcommon.py;h=78f238f8428c5df0fce2dcc26f66b1301595e62b;hb=72546c831e460307570ae292053c8395e2bbf0ef;hp=692d828da9ef9739e1b05908e6a9f39259b0940b;hpb=dfa50793d8541ff2c5603f7c3b727c0f6e551d8d;p=youtube-dl diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 692d828da..78f238f84 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -63,16 +63,18 @@ class InfoExtractor(object): * tbr Average bitrate of audio and video in KBit/s * abr Average audio bitrate in KBit/s * acodec Name of the audio codec in use + * asr Audio sampling rate in Hertz * vbr Average video bitrate in KBit/s * vcodec Name of the video codec in use + * container Name of the container format * filesize The number of bytes, if known in advance * player_url SWF Player URL (used for rtmpdump). * protocol The protocol that will be used for the actual download, lower-case. - "http", "https", "rtsp", "rtmp" or so. + "http", "https", "rtsp", "rtmp", "m3u8" or so. * preference Order number of this format. If this field is present and not None, the formats get sorted - by this field. + by this field, regardless of all other values. -1 for default (order by other properties), -2 or smaller for less than default. * quality Order number of the video quality of this @@ -86,12 +88,18 @@ class InfoExtractor(object): The following fields are optional: + display_id An alternative identifier for the video, not necessarily + unique, but available before title. Typically, id is + something like "4234987", title "Dancing naked mole rats", + and display_id "dancing-naked-mole-rats" thumbnails: A list of dictionaries (with the entries "resolution" and "url") for the varying thumbnails thumbnail: Full URL to a video thumbnail image. description: One-line video description. uploader: Full name of the video uploader. + timestamp: UNIX timestamp of the moment the video became available. upload_date: Video upload date (YYYYMMDD). + If not explicitly set, calculated from timestamp. uploader_id: Nickname or id of the video uploader. location: Physical location of the video. subtitles: The subtitle file contents as a dictionary in the format @@ -112,9 +120,6 @@ class InfoExtractor(object): _real_extract() methods and define a _VALID_URL regexp. Probably, they should also be added to the list of extractors. - _real_extract() must return a *list* of information dictionaries as - described above. - Finally, the _WORKING attribute should be set to False for broken IEs in order to warn the users and skip the tests. """ @@ -220,6 +225,8 @@ class InfoExtractor(object): webpage_bytes[:1024]) if m: encoding = m.group(1).decode('ascii') + elif webpage_bytes.startswith(b'\xff\xfe'): + encoding = 'utf-16' else: encoding = 'utf-8' if self._downloader.params.get('dump_intermediate_pages', False): @@ -236,7 +243,7 @@ class InfoExtractor(object): except AttributeError: url = url_or_request if len(url) > 200: - h = hashlib.md5(url).hexdigest() + h = u'___' + hashlib.md5(url.encode('utf-8')).hexdigest() url = url[:200 - len(h)] + h raw_filename = ('%s_%s.dump' % (video_id, url)) filename = sanitize_filename(raw_filename, restricted=True) @@ -267,8 +274,11 @@ class InfoExtractor(object): def _download_json(self, url_or_request, video_id, note=u'Downloading JSON metadata', - errnote=u'Unable to download JSON metadata'): + errnote=u'Unable to download JSON metadata', + transform_source=None): json_string = self._download_webpage(url_or_request, video_id, note, errnote) + if transform_source: + json_string = transform_source(json_string) try: return json.loads(json_string) except ValueError as ve: @@ -395,7 +405,7 @@ class InfoExtractor(object): # Helper functions for extracting OpenGraph info @staticmethod def _og_regexes(prop): - content_re = r'content=(?:"([^>]+?)"|\'(.+?)\')' + content_re = r'content=(?:"([^>]+?)"|\'([^>]+?)\')' property_re = r'(?:name|property)=[\'"]og:%s[\'"]' % re.escape(prop) template = r']+?%s[^>]+?%s' return [ @@ -425,14 +435,14 @@ class InfoExtractor(object): if secure: regexes = self._og_regexes('video:secure_url') + regexes return self._html_search_regex(regexes, html, name, **kargs) - def _html_search_meta(self, name, html, display_name=None): + def _html_search_meta(self, name, html, display_name=None, fatal=False): if display_name is None: display_name = name return self._html_search_regex( r'''(?ix)]+(?:itemprop|name|property)=["\']%s["\']) [^>]+content=["\']([^"\']+)["\']''' % re.escape(name), - html, display_name, fatal=False) + html, display_name, fatal=fatal) def _dc_search_uploader(self, html): return self._html_search_meta('dc.creator', html, 'uploader') @@ -461,7 +471,14 @@ class InfoExtractor(object): } return RATING_TABLE.get(rating.lower(), None) + def _twitter_search_player(self, html): + return self._html_search_meta('twitter:player', html, + 'twitter card player') + def _sort_formats(self, formats): + if not formats: + raise ExtractorError(u'No video formats found') + def _formats_key(f): # TODO remove the following workaround from ..utils import determine_ext