X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Fextractor%2Fcommon.py;h=f7478d4598e8f15ea802ec9451aff53ce452fe7a;hb=a3978a615950af6e990313820f93baddce067ee4;hp=0a90382de116bdffdb88edfc0ebe1a38fb758dd6;hpb=e6812ac99dfc9117f3dd78425bf4bbe01601ecfd;p=youtube-dl diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 0a90382de..f7478d459 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -1,4 +1,6 @@ import base64 +import hashlib +import json import os import re import socket @@ -9,6 +11,7 @@ import xml.etree.ElementTree from ..utils import ( compat_http_client, compat_urllib_error, + compat_urllib_parse_urlparse, compat_str, clean_html, @@ -50,20 +53,32 @@ class InfoExtractor(object): Calculated from the format_id, width, height. and format_note fields if missing. * format_id A short description of the format - ("mp4_h264_opus" or "19") + ("mp4_h264_opus" or "19"). + Technically optional, but strongly recommended. * format_note Additional info about the format ("3D" or "DASH video") * width Width of the video, if known * height Height of the video, if known * resolution Textual description of width and height + * tbr Average bitrate of audio and video in KBit/s * abr Average audio bitrate in KBit/s * acodec Name of the audio codec in use + * asr Audio sampling rate in Hertz * vbr Average video bitrate in KBit/s * vcodec Name of the video codec in use + * container Name of the container format * filesize The number of bytes, if known in advance * player_url SWF Player URL (used for rtmpdump). + * protocol The protocol that will be used for the actual + download, lower-case. + "http", "https", "rtsp", "rtmp", "m3u8" or so. * preference Order number of this format. If this field is - present, the formats get sorted by this field. + present and not None, the formats get sorted + by this field. + -1 for default (order by other properties), + -2 or smaller for less than default. + * quality Order number of the video quality of this + format, irrespective of the file format. -1 for default (order by other properties), -2 or smaller for less than default. url: Final video URL. @@ -207,6 +222,8 @@ class InfoExtractor(object): webpage_bytes[:1024]) if m: encoding = m.group(1).decode('ascii') + elif webpage_bytes.startswith(b'\xff\xfe'): + encoding = 'utf-16' else: encoding = 'utf-8' if self._downloader.params.get('dump_intermediate_pages', False): @@ -222,6 +239,9 @@ class InfoExtractor(object): url = url_or_request.get_full_url() except AttributeError: url = url_or_request + if len(url) > 200: + h = u'___' + hashlib.md5(url.encode('utf-8')).hexdigest() + url = url[:200 - len(h)] + h raw_filename = ('%s_%s.dump' % (video_id, url)) filename = sanitize_filename(raw_filename, restricted=True) self.to_screen(u'Saving request to ' + filename) @@ -249,6 +269,15 @@ class InfoExtractor(object): xml_string = transform_source(xml_string) return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8')) + def _download_json(self, url_or_request, video_id, + note=u'Downloading JSON metadata', + errnote=u'Unable to download JSON metadata'): + json_string = self._download_webpage(url_or_request, video_id, note, errnote) + try: + return json.loads(json_string) + except ValueError as ve: + raise ExtractorError('Failed to download JSON', cause=ve) + def report_warning(self, msg, video_id=None): idstr = u'' if video_id is None else u'%s: ' % video_id self._downloader.report_warning( @@ -371,7 +400,7 @@ class InfoExtractor(object): @staticmethod def _og_regexes(prop): content_re = r'content=(?:"([^>]+?)"|\'(.+?)\')' - property_re = r'property=[\'"]og:%s[\'"]' % re.escape(prop) + property_re = r'(?:name|property)=[\'"]og:%s[\'"]' % re.escape(prop) template = r']+?%s[^>]+?%s' return [ template % (property_re, content_re), @@ -437,6 +466,9 @@ class InfoExtractor(object): return RATING_TABLE.get(rating.lower(), None) def _sort_formats(self, formats): + if not formats: + raise ExtractorError(u'No video formats found') + def _formats_key(f): # TODO remove the following workaround from ..utils import determine_ext @@ -445,7 +477,11 @@ class InfoExtractor(object): preference = f.get('preference') if preference is None: - preference = 0 if f.get('url', '').startswith('http') else -0.1 + proto = f.get('protocol') + if proto is None: + proto = compat_urllib_parse_urlparse(f.get('url', '')).scheme + + preference = 0 if proto in ['http', 'https'] else -0.1 if f.get('ext') in ['f4f', 'f4m']: # Not yet supported preference -= 0.5 @@ -472,9 +508,11 @@ class InfoExtractor(object): return ( preference, + f.get('quality') if f.get('quality') is not None else -1, f.get('height') if f.get('height') is not None else -1, f.get('width') if f.get('width') is not None else -1, ext_preference, + f.get('tbr') if f.get('tbr') is not None else -1, f.get('vbr') if f.get('vbr') is not None else -1, f.get('abr') if f.get('abr') is not None else -1, audio_ext_preference,