X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Fextractor%2Fcommon.py;h=84fca8ba0b2577696877c117a13fcc0a5ce40735;hb=280bc5dad651728e493b3b25a672a9aaef590683;hp=0dd50444456351b9753c3d08e69e0f24616e91cd;hpb=f45f96f8f83b7004960a269fea8bfab40a14c725;p=youtube-dl diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 0dd504444..84fca8ba0 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -1,4 +1,6 @@ import base64 +import hashlib +import json import os import re import socket @@ -9,6 +11,7 @@ import xml.etree.ElementTree from ..utils import ( compat_http_client, compat_urllib_error, + compat_urllib_parse_urlparse, compat_str, clean_html, @@ -37,10 +40,12 @@ class InfoExtractor(object): id: Video identifier. title: Video title, unescaped. - Additionally, it must contain either a formats entry or url and ext: + Additionally, it must contain either a formats entry or a url one: - formats: A list of dictionaries for each format available, it must - be ordered from worst to best quality. Potential fields: + formats: A list of dictionaries for each format available, ordered + from worst to best quality. + + Potential fields: * url Mandatory. The URL of the video file * ext Will be calculated from url if missing * format A human-readable description of the format @@ -48,17 +53,34 @@ class InfoExtractor(object): Calculated from the format_id, width, height. and format_note fields if missing. * format_id A short description of the format - ("mp4_h264_opus" or "19") + ("mp4_h264_opus" or "19"). + Technically optional, but strongly recommended. * format_note Additional info about the format ("3D" or "DASH video") * width Width of the video, if known * height Height of the video, if known + * resolution Textual description of width and height + * tbr Average bitrate of audio and video in KBit/s * abr Average audio bitrate in KBit/s * acodec Name of the audio codec in use + * asr Audio sampling rate in Hertz * vbr Average video bitrate in KBit/s * vcodec Name of the video codec in use + * container Name of the container format * filesize The number of bytes, if known in advance * player_url SWF Player URL (used for rtmpdump). + * protocol The protocol that will be used for the actual + download, lower-case. + "http", "https", "rtsp", "rtmp", "m3u8" or so. + * preference Order number of this format. If this field is + present and not None, the formats get sorted + by this field. + -1 for default (order by other properties), + -2 or smaller for less than default. + * quality Order number of the video quality of this + format, irrespective of the file format. + -1 for default (order by other properties), + -2 or smaller for less than default. url: Final video URL. ext: Video filename extension. format: The video format, defaults to ext (used for --get-format) @@ -200,6 +222,8 @@ class InfoExtractor(object): webpage_bytes[:1024]) if m: encoding = m.group(1).decode('ascii') + elif webpage_bytes.startswith(b'\xff\xfe'): + encoding = 'utf-16' else: encoding = 'utf-8' if self._downloader.params.get('dump_intermediate_pages', False): @@ -215,6 +239,9 @@ class InfoExtractor(object): url = url_or_request.get_full_url() except AttributeError: url = url_or_request + if len(url) > 200: + h = u'___' + hashlib.md5(url.encode('utf-8')).hexdigest() + url = url[:200 - len(h)] + h raw_filename = ('%s_%s.dump' % (video_id, url)) filename = sanitize_filename(raw_filename, restricted=True) self.to_screen(u'Saving request to ' + filename) @@ -242,6 +269,18 @@ class InfoExtractor(object): xml_string = transform_source(xml_string) return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8')) + def _download_json(self, url_or_request, video_id, + note=u'Downloading JSON metadata', + errnote=u'Unable to download JSON metadata', + transform_source=None): + json_string = self._download_webpage(url_or_request, video_id, note, errnote) + if transform_source: + json_string = transform_source(json_string) + try: + return json.loads(json_string) + except ValueError as ve: + raise ExtractorError('Failed to download JSON', cause=ve) + def report_warning(self, msg, video_id=None): idstr = u'' if video_id is None else u'%s: ' % video_id self._downloader.report_warning( @@ -363,8 +402,8 @@ class InfoExtractor(object): # Helper functions for extracting OpenGraph info @staticmethod def _og_regexes(prop): - content_re = r'content=(?:"([^>]+?)"|\'(.+?)\')' - property_re = r'property=[\'"]og:%s[\'"]' % re.escape(prop) + content_re = r'content=(?:"([^>]+?)"|\'([^>]+?)\')' + property_re = r'(?:name|property)=[\'"]og:%s[\'"]' % re.escape(prop) template = r']+?%s[^>]+?%s' return [ template % (property_re, content_re), @@ -429,6 +468,65 @@ class InfoExtractor(object): } return RATING_TABLE.get(rating.lower(), None) + def _twitter_search_player(self, html): + return self._html_search_meta('twitter:player', html, + 'twitter card player') + + def _sort_formats(self, formats): + if not formats: + raise ExtractorError(u'No video formats found') + + def _formats_key(f): + # TODO remove the following workaround + from ..utils import determine_ext + if not f.get('ext') and 'url' in f: + f['ext'] = determine_ext(f['url']) + + preference = f.get('preference') + if preference is None: + proto = f.get('protocol') + if proto is None: + proto = compat_urllib_parse_urlparse(f.get('url', '')).scheme + + preference = 0 if proto in ['http', 'https'] else -0.1 + if f.get('ext') in ['f4f', 'f4m']: # Not yet supported + preference -= 0.5 + + if f.get('vcodec') == 'none': # audio only + if self._downloader.params.get('prefer_free_formats'): + ORDER = [u'aac', u'mp3', u'm4a', u'webm', u'ogg', u'opus'] + else: + ORDER = [u'webm', u'opus', u'ogg', u'mp3', u'aac', u'm4a'] + ext_preference = 0 + try: + audio_ext_preference = ORDER.index(f['ext']) + except ValueError: + audio_ext_preference = -1 + else: + if self._downloader.params.get('prefer_free_formats'): + ORDER = [u'flv', u'mp4', u'webm'] + else: + ORDER = [u'webm', u'flv', u'mp4'] + try: + ext_preference = ORDER.index(f['ext']) + except ValueError: + ext_preference = -1 + audio_ext_preference = 0 + + return ( + preference, + f.get('quality') if f.get('quality') is not None else -1, + f.get('height') if f.get('height') is not None else -1, + f.get('width') if f.get('width') is not None else -1, + ext_preference, + f.get('tbr') if f.get('tbr') is not None else -1, + f.get('vbr') if f.get('vbr') is not None else -1, + f.get('abr') if f.get('abr') is not None else -1, + audio_ext_preference, + f.get('filesize') if f.get('filesize') is not None else -1, + f.get('format_id'), + ) + formats.sort(key=_formats_key) class SearchInfoExtractor(InfoExtractor):