X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Fextractor%2Fcommon.py;h=d859aea5285f331c43e44b95b41a12aee0f41f87;hb=bf20b9c5405b276e6de29b1b28b2e3ad2182e891;hp=8ed97f8dddfc5d9b51dab2630008c36e054a50bf;hpb=830d53bfae7a665b55656dd50c9f35f0d0b0161d;p=youtube-dl diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 8ed97f8dd..d859aea52 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -22,7 +22,9 @@ from ..compat import ( compat_str, ) from ..utils import ( + NO_DEFAULT, age_restricted, + bug_reports_message, clean_html, compiled_regex_type, ExtractorError, @@ -32,7 +34,6 @@ from ..utils import ( sanitize_filename, unescapeHTML, ) -_NO_DEFAULT = object() class InfoExtractor(object): @@ -46,7 +47,7 @@ class InfoExtractor(object): information possibly downloading the video to the file system, among other possible outcomes. - The type field determines the the type of the result. + The type field determines the type of the result. By far the most common value (and the default if _type is missing) is "video", which indicates a single video. @@ -110,11 +111,8 @@ class InfoExtractor(object): (quality takes higher priority) -1 for default (order by other properties), -2 or smaller for less than default. - * http_method HTTP method to use for the download. * http_headers A dictionary of additional HTTP headers to add to the request. - * http_post_data Additional data to send with a POST - request. * stretched_ratio If given and not 1, indicates that the video's pixels are not square. width : height ratio as float. @@ -324,7 +322,7 @@ class InfoExtractor(object): self._downloader.report_warning(errmsg) return False - def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True): + def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None): """ Returns a tuple (page content as string, URL handle) """ # Strip hashes from the URL (#1038) if isinstance(url_or_request, (compat_str, str)): @@ -334,14 +332,11 @@ class InfoExtractor(object): if urlh is False: assert not fatal return False - content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal) + content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding) return (content, urlh) - def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None): - content_type = urlh.headers.get('Content-Type', '') - webpage_bytes = urlh.read() - if prefix is not None: - webpage_bytes = prefix + webpage_bytes + @staticmethod + def _guess_encoding_from_content(content_type, webpage_bytes): m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type) if m: encoding = m.group(1) @@ -354,6 +349,16 @@ class InfoExtractor(object): encoding = 'utf-16' else: encoding = 'utf-8' + + return encoding + + def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None): + content_type = urlh.headers.get('Content-Type', '') + webpage_bytes = urlh.read() + if prefix is not None: + webpage_bytes = prefix + webpage_bytes + if not encoding: + encoding = self._guess_encoding_from_content(content_type, webpage_bytes) if self._downloader.params.get('dump_intermediate_pages', False): try: url = url_or_request.get_full_url() @@ -410,13 +415,13 @@ class InfoExtractor(object): return content - def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5): + def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None): """ Returns the data of the page as a string """ success = False try_count = 0 while success is False: try: - res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal) + res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding) success = True except compat_http_client.IncompleteRead as e: try_count += 1 @@ -431,10 +436,10 @@ class InfoExtractor(object): def _download_xml(self, url_or_request, video_id, note='Downloading XML', errnote='Unable to download XML', - transform_source=None, fatal=True): + transform_source=None, fatal=True, encoding=None): """Return the xml as an xml.etree.ElementTree.Element""" xml_string = self._download_webpage( - url_or_request, video_id, note, errnote, fatal=fatal) + url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding) if xml_string is False: return xml_string if transform_source: @@ -445,9 +450,10 @@ class InfoExtractor(object): note='Downloading JSON metadata', errnote='Unable to download JSON metadata', transform_source=None, - fatal=True): + fatal=True, encoding=None): json_string = self._download_webpage( - url_or_request, video_id, note, errnote, fatal=fatal) + url_or_request, video_id, note, errnote, fatal=fatal, + encoding=encoding) if (not fatal) and json_string is False: return None return self._parse_json( @@ -517,7 +523,7 @@ class InfoExtractor(object): video_info['description'] = playlist_description return video_info - def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0, group=None): + def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None): """ Perform a regex search on the given string, using a single or a list of patterns returning the first matching group. @@ -543,16 +549,15 @@ class InfoExtractor(object): return next(g for g in mobj.groups() if g is not None) else: return mobj.group(group) - elif default is not _NO_DEFAULT: + elif default is not NO_DEFAULT: return default elif fatal: raise RegexNotFoundError('Unable to extract %s' % _name) else: - self._downloader.report_warning('unable to extract %s; ' - 'please report this issue on http://yt-dl.org/bug' % _name) + self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message()) return None - def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0, group=None): + def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None): """ Like _search_regex, but strips HTML tags and unescapes entities. """ @@ -564,7 +569,7 @@ class InfoExtractor(object): def _get_login_info(self): """ - Get the the login info as (username, password) + Get the login info as (username, password) It will look in the netrc file using the _NETRC_MACHINE value If there's no info available, return (None, None) """ @@ -700,7 +705,7 @@ class InfoExtractor(object): return self._html_search_meta('twitter:player', html, 'twitter card player') - def _sort_formats(self, formats): + def _sort_formats(self, formats, field_preference=None): if not formats: raise ExtractorError('No video formats found') @@ -710,6 +715,9 @@ class InfoExtractor(object): if not f.get('ext') and 'url' in f: f['ext'] = determine_ext(f['url']) + if isinstance(field_preference, (list, tuple)): + return tuple(f.get(field) if f.get(field) is not None else -1 for field in field_preference) + preference = f.get('preference') if preference is None: proto = f.get('protocol') @@ -756,7 +764,7 @@ class InfoExtractor(object): f.get('fps') if f.get('fps') is not None else -1, f.get('filesize_approx') if f.get('filesize_approx') is not None else -1, f.get('source_preference') if f.get('source_preference') is not None else -1, - f.get('format_id'), + f.get('format_id') if f.get('format_id') is not None else '', ) formats.sort(key=_formats_key) @@ -778,8 +786,8 @@ class InfoExtractor(object): return True except ExtractorError as e: if isinstance(e.cause, compat_HTTPError): - self.report_warning( - '%s URL is invalid, skipping' % item, video_id) + self.to_screen( + '%s: %s URL is invalid, skipping' % (video_id, item)) return False raise @@ -838,7 +846,8 @@ class InfoExtractor(object): def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None, entry_protocol='m3u8', preference=None, - m3u8_id=None): + m3u8_id=None, note=None, errnote=None, + fatal=True): formats = [{ 'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])), @@ -857,8 +866,11 @@ class InfoExtractor(object): m3u8_doc = self._download_webpage( m3u8_url, video_id, - note='Downloading m3u8 information', - errnote='Failed to download m3u8 information') + note=note or 'Downloading m3u8 information', + errnote=errnote or 'Failed to download m3u8 information', + fatal=fatal) + if m3u8_doc is False: + return m3u8_doc last_info = None last_media = None kv_rex = re.compile( @@ -888,7 +900,7 @@ class InfoExtractor(object): format_id = [] if m3u8_id: format_id.append(m3u8_id) - last_media_name = last_media.get('NAME') if last_media else None + last_media_name = last_media.get('NAME') if last_media and last_media.get('TYPE') != 'SUBTITLES' else None format_id.append(last_media_name if last_media_name else '%d' % (tbr if tbr else len(formats))) f = { 'format_id': '-'.join(format_id), @@ -1064,9 +1076,6 @@ class InfoExtractor(object): def _get_automatic_captions(self, *args, **kwargs): raise NotImplementedError("This method must be implemented by subclasses") - def _subtitles_timecode(self, seconds): - return '%02d:%02d:%02d.%03d' % (seconds / 3600, (seconds % 3600) / 60, seconds % 60, (seconds % 1) * 1000) - class SearchInfoExtractor(InfoExtractor): """