X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Fextractor%2Fcommon.py;h=dc508050413c8490882323e01d91ea3a3ba88c9a;hb=3b58d94f71b0e7b3ace6f5965f7335aa95fd0c1e;hp=271bf85968690f00e25252294b51aa0e4cdecb15;hpb=5eb778bf4d52c2d1b53182e48c630a3458f2fb22;p=youtube-dl diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 271bf8596..dc5080504 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -14,10 +14,12 @@ import xml.etree.ElementTree from ..compat import ( compat_cookiejar, + compat_cookies, compat_HTTPError, compat_http_client, compat_urllib_error, compat_urllib_parse_urlparse, + compat_urllib_request, compat_urlparse, compat_str, ) @@ -65,7 +67,7 @@ class InfoExtractor(object): Potential fields: * url Mandatory. The URL of the video file - * ext Will be calculated from url if missing + * ext Will be calculated from URL if missing * format A human-readable description of the format ("mp4 container with h264/opus"). Calculated from the format_id, width, height. @@ -155,7 +157,7 @@ class InfoExtractor(object): lower to higher preference, each element is a dictionary with the "ext" entry and one of: * "data": The subtitles file contents - * "url": A url pointing to the subtitles file + * "url": A URL pointing to the subtitles file automatic_captions: Like 'subtitles', used by the YoutubeIE for automatically generated captions duration: Length of the video in seconds, as an integer. @@ -176,13 +178,18 @@ class InfoExtractor(object): Set to "root" to indicate that this is a comment to the original video. age_limit: Age restriction for the video, as an integer (years) - webpage_url: The url to the video webpage, if given to youtube-dl it + webpage_url: The URL to the video webpage, if given to youtube-dl it should allow to get the same result again. (It will be set by YoutubeDL if it's missing) categories: A list of categories that the video falls in, for example ["Sports", "Berlin"] + tags: A list of tags assigned to the video, e.g. ["sweden", "pop music"] is_live: True, False, or None (=unknown). Whether this video is a live stream that goes on instead of a fixed-length video. + start_time: Time in seconds where the reproduction should start, as + specified in the URL. + end_time: Time in seconds where the reproduction should end, as + specified in the URL. Unless mentioned otherwise, the fields should be Unicode strings. @@ -501,7 +508,7 @@ class InfoExtractor(object): # Methods for following #608 @staticmethod def url_result(url, ie=None, video_id=None, video_title=None): - """Returns a url that points to a page that should be processed""" + """Returns a URL that points to a page that should be processed""" # TODO: ie should be the class used for getting the info video_info = {'_type': 'url', 'url': url, @@ -626,6 +633,12 @@ class InfoExtractor(object): template % (content_re, property_re), ] + @staticmethod + def _meta_regex(prop): + return r'''(?isx)]+(?:itemprop|name|property)=(["\']?)%s\1) + [^>]+?content=(["\'])(?P.*?)\2''' % re.escape(prop) + def _og_search_property(self, prop, html, name=None, **kargs): if name is None: name = 'OpenGraph %s' % prop @@ -635,7 +648,7 @@ class InfoExtractor(object): return unescapeHTML(escaped) def _og_search_thumbnail(self, html, **kargs): - return self._og_search_property('image', html, 'thumbnail url', fatal=False, **kargs) + return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs) def _og_search_description(self, html, **kargs): return self._og_search_property('description', html, fatal=False, **kargs) @@ -656,9 +669,7 @@ class InfoExtractor(object): if display_name is None: display_name = name return self._html_search_regex( - r'''(?isx)]+(?:itemprop|name|property)=(["\']?)%s\1) - [^>]+?content=(["\'])(?P.*?)\2''' % re.escape(name), + self._meta_regex(name), html, display_name, fatal=fatal, group='content', **kwargs) def _dc_search_uploader(self, html): @@ -836,13 +847,14 @@ class InfoExtractor(object): self.to_screen(msg) time.sleep(timeout) - def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None): + def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None, + transform_source=lambda s: fix_xml_ampersands(s).strip()): manifest = self._download_xml( manifest_url, video_id, 'Downloading f4m manifest', 'Unable to download f4m manifest', # Some manifests may be malformed, e.g. prosiebensat1 generated manifests # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244) - transform_source=lambda s: fix_xml_ampersands(s).strip()) + transform_source=transform_source) formats = [] manifest_version = '1.0' @@ -995,7 +1007,7 @@ class InfoExtractor(object): def _parse_smil_video(self, video, video_id, base, rtmp_count): src = video.get('src') if not src: - return ([], rtmp_count) + return [], rtmp_count bitrate = int_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000) width = int_or_none(video.get('width')) height = int_or_none(video.get('height')) @@ -1008,7 +1020,7 @@ class InfoExtractor(object): proto = 'http' ext = video.get('ext') if proto == 'm3u8': - return (self._extract_m3u8_formats(src, video_id, ext), rtmp_count) + return self._extract_m3u8_formats(src, video_id, ext), rtmp_count elif proto == 'rtmp': rtmp_count += 1 streamer = video.get('streamer') or base @@ -1064,6 +1076,12 @@ class InfoExtractor(object): None, '/', True, False, expire_time, '', None, None, None) self._downloader.cookiejar.set_cookie(cookie) + def _get_cookies(self, url): + """ Return a compat_cookies.SimpleCookie with the cookies for the url """ + req = compat_urllib_request.Request(url) + self._downloader.cookiejar.add_cookie_header(req) + return compat_cookies.SimpleCookie(req.get_header('Cookie')) + def get_testcases(self, include_onlymatching=False): t = getattr(self, '_TEST', None) if t: @@ -1115,7 +1133,7 @@ class InfoExtractor(object): class SearchInfoExtractor(InfoExtractor): """ Base class for paged search queries extractors. - They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query} + They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query} Instances should define _SEARCH_KEY and _MAX_RESULTS. """