X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Fextractor%2Fcommon.py;h=eb9bfa3d15a2c5084fbf67f05a401474ad2f881d;hb=5c2266df4b9aeb7881ed8c026a038e2a25e43734;hp=def6caa0d1eb320a091ecaecc50db58b82d4f28d;hpb=d5d7bdaeb517f389fff5a6557f072f3586e3c440;p=youtube-dl diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index def6caa0d..eb9bfa3d1 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -10,19 +10,18 @@ import re import socket import sys import time -import xml.etree.ElementTree from ..compat import ( compat_cookiejar, compat_cookies, - compat_HTTPError, + compat_getpass, compat_http_client, compat_urllib_error, compat_urllib_parse, compat_urllib_parse_urlparse, - compat_urllib_request, compat_urlparse, compat_str, + compat_etree_fromstring, ) from ..utils import ( NO_DEFAULT, @@ -37,8 +36,12 @@ from ..utils import ( int_or_none, RegexNotFoundError, sanitize_filename, + sanitized_Request, unescapeHTML, + unified_strdate, url_basename, + xpath_text, + xpath_with_ns, ) @@ -149,6 +152,7 @@ class InfoExtractor(object): description: Full video description. uploader: Full name of the video uploader. creator: The main artist who created the video. + release_date: The date (YYYYMMDD) when the video was released. timestamp: UNIX timestamp of the moment the video became available. upload_date: Video upload date (YYYYMMDD). If not explicitly set, calculated from timestamp. @@ -160,12 +164,14 @@ class InfoExtractor(object): with the "ext" entry and one of: * "data": The subtitles file contents * "url": A URL pointing to the subtitles file + "ext" will be calculated from URL if missing automatic_captions: Like 'subtitles', used by the YoutubeIE for automatically generated captions duration: Length of the video in seconds, as an integer. view_count: How many users have watched the video on the platform. like_count: Number of positive ratings of the video dislike_count: Number of negative ratings of the video + repost_count: Number of reposts of the video average_rating: Average rating give by users, the scale used depends on the webpage comment_count: Number of comments on the video comments: A list of comments, each with one or more of the following @@ -202,8 +208,8 @@ class InfoExtractor(object): There must be a key "entries", which is a list, an iterable, or a PagedList object, each element of which is a valid dictionary by this specification. - Additionally, playlists can have "title" and "id" attributes with the same - semantics as videos (see above). + Additionally, playlists can have "title", "description" and "id" attributes + with the same semantics as videos (see above). _type "multi_video" indicates that there are multiple videos that @@ -304,11 +310,11 @@ class InfoExtractor(object): @classmethod def ie_key(cls): """A string for getting the InfoExtractor with get_info_extractor""" - return cls.__name__[:-2] + return compat_str(cls.__name__[:-2]) @property def IE_NAME(self): - return type(self).__name__[:-2] + return compat_str(type(self).__name__[:-2]) def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True): """ Returns the response handle """ @@ -455,7 +461,7 @@ class InfoExtractor(object): return xml_string if transform_source: xml_string = transform_source(xml_string) - return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8')) + return compat_etree_fromstring(xml_string.encode('utf-8')) def _download_json(self, url_or_request, video_id, note='Downloading JSON metadata', @@ -507,6 +513,18 @@ class InfoExtractor(object): """Report attempt to log in.""" self.to_screen('Logging in') + @staticmethod + def raise_login_required(msg='This video is only available for registered users'): + raise ExtractorError( + '%s. Use --username and --password or --netrc to provide account credentials.' % msg, + expected=True) + + @staticmethod + def raise_geo_restricted(msg='This video is not available from your location due to geo restriction'): + raise ExtractorError( + '%s. You might want to use --proxy to workaround.' % msg, + expected=True) + # Methods for following #608 @staticmethod def url_result(url, ie=None, video_id=None, video_title=None): @@ -608,7 +626,7 @@ class InfoExtractor(object): return (username, password) - def _get_tfa_info(self): + def _get_tfa_info(self, note='two-factor verification code'): """ Get the two-factor authentication info TODO - asking the user will be required for sms/phone verify @@ -622,13 +640,14 @@ class InfoExtractor(object): if downloader_params.get('twofactor', None) is not None: return downloader_params['twofactor'] - return None + return compat_getpass('Type %s and press [Return]: ' % note) # Helper functions for extracting OpenGraph info @staticmethod def _og_regexes(prop): - content_re = r'content=(?:"([^>]+?)"|\'([^>]+?)\')' - property_re = r'(?:name|property)=[\'"]og:%s[\'"]' % re.escape(prop) + content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))' + property_re = (r'(?:name|property)=(?:\'og:%(prop)s\'|"og:%(prop)s"|\s*og:%(prop)s\b)' + % {'prop': re.escape(prop)}) template = r']+?%s[^>]+?%s' return [ template % (property_re, content_re), @@ -638,7 +657,7 @@ class InfoExtractor(object): @staticmethod def _meta_regex(prop): return r'''(?isx)]+(?:itemprop|name|property|id)=(["\']?)%s\1) + (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1) [^>]+?content=(["\'])(?P.*?)\2''' % re.escape(prop) def _og_search_property(self, prop, html, name=None, **kargs): @@ -722,20 +741,23 @@ class InfoExtractor(object): @staticmethod def _hidden_inputs(html): - return dict([ - (input.group('name'), input.group('value')) for input in re.finditer( - r'''(?x) - ["\'])hidden(?P=q_hidden)\s+ - name=(?P["\'])(?P.+?)(?P=q_name)\s+ - (?:id=(?P["\']).+?(?P=q_id)\s+)? - value=(?P["\'])(?P.*?)(?P=q_value) - ''', html) - ]) + html = re.sub(r'', '', html) + hidden_inputs = {} + for input in re.findall(r'(?i)]+)>', html): + if not re.search(r'type=(["\'])(?:hidden|submit)\1', input): + continue + name = re.search(r'name=(["\'])(?P.+?)\1', input) + if not name: + continue + value = re.search(r'value=(["\'])(?P.*?)\1', input) + if not value: + continue + hidden_inputs[name.group('value')] = value.group('value') + return hidden_inputs def _form_hidden_inputs(self, form_id, html): form = self._search_regex( - r'(?s)]+?id=(["\'])%s\1[^>]*>(?P
.+?)
' % form_id, + r'(?is)]+?id=(["\'])%s\1[^>]*>(?P
.+?)
' % form_id, html, '%s form' % form_id, group='form') return self._hidden_inputs(form) @@ -819,7 +841,7 @@ class InfoExtractor(object): self._request_webpage(url, video_id, 'Checking %s URL' % item) return True except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError): + if isinstance(e.cause, compat_urllib_error.URLError): self.to_screen( '%s: %s URL is invalid, skipping' % (video_id, item)) return False @@ -850,13 +872,18 @@ class InfoExtractor(object): time.sleep(timeout) def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None, - transform_source=lambda s: fix_xml_ampersands(s).strip()): + transform_source=lambda s: fix_xml_ampersands(s).strip(), + fatal=True): manifest = self._download_xml( manifest_url, video_id, 'Downloading f4m manifest', 'Unable to download f4m manifest', # Some manifests may be malformed, e.g. prosiebensat1 generated manifests # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244) - transform_source=transform_source) + transform_source=transform_source, + fatal=fatal) + + if manifest is False: + return manifest formats = [] manifest_version = '1.0' @@ -864,6 +891,11 @@ class InfoExtractor(object): if not media_nodes: manifest_version = '2.0' media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media') + base_url = xpath_text( + manifest, ['{http://ns.adobe.com/f4m/1.0}baseURL', '{http://ns.adobe.com/f4m/2.0}baseURL'], + 'base URL', default=None) + if base_url: + base_url = base_url.strip() for i, media_el in enumerate(media_nodes): if manifest_version == '2.0': media_url = media_el.attrib.get('href') or media_el.attrib.get('url') @@ -871,13 +903,16 @@ class InfoExtractor(object): continue manifest_url = ( media_url if media_url.startswith('http://') or media_url.startswith('https://') - else ('/'.join(manifest_url.split('/')[:-1]) + '/' + media_url)) + else ((base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url)) # If media_url is itself a f4m manifest do the recursive extraction # since bitrates in parent manifest (this one) and media_url manifest # may differ leading to inability to resolve the format by requested # bitrate in f4m downloader if determine_ext(manifest_url) == 'f4m': - formats.extend(self._extract_f4m_formats(manifest_url, video_id, preference, f4m_id)) + f4m_formats = self._extract_f4m_formats( + manifest_url, video_id, preference, f4m_id, fatal=fatal) + if f4m_formats: + formats.extend(f4m_formats) continue tbr = int_or_none(media_el.attrib.get('bitrate')) formats.append({ @@ -913,13 +948,15 @@ class InfoExtractor(object): if re.match(r'^https?://', u) else compat_urlparse.urljoin(m3u8_url, u)) - m3u8_doc = self._download_webpage( + res = self._download_webpage_handle( m3u8_url, video_id, note=note or 'Downloading m3u8 information', errnote=errnote or 'Failed to download m3u8 information', fatal=fatal) - if m3u8_doc is False: - return m3u8_doc + if res is False: + return res + m3u8_doc, urlh = res + m3u8_url = urlh.geturl() last_info = None last_media = None kv_rex = re.compile( @@ -1025,6 +1062,7 @@ class InfoExtractor(object): video_id = os.path.splitext(url_basename(smil_url))[0] title = None description = None + upload_date = None for meta in smil.findall(self._xpath_ns('./head/meta', namespace)): name = meta.attrib.get('name') content = meta.attrib.get('content') @@ -1034,11 +1072,22 @@ class InfoExtractor(object): title = content elif not description and name in ('description', 'abstract'): description = content + elif not upload_date and name == 'date': + upload_date = unified_strdate(content) + + thumbnails = [{ + 'id': image.get('type'), + 'url': image.get('src'), + 'width': int_or_none(image.get('width')), + 'height': int_or_none(image.get('height')), + } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')] return { 'id': video_id, 'title': title or video_id, 'description': description, + 'upload_date': upload_date, + 'thumbnails': thumbnails, 'formats': formats, 'subtitles': subtitles, } @@ -1047,7 +1096,7 @@ class InfoExtractor(object): return self._search_regex( r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None) - def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None): + def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None): base = smil_url for meta in smil.findall(self._xpath_ns('./head/meta', namespace)): b = meta.get('base') or meta.get('httpBase') @@ -1065,7 +1114,7 @@ class InfoExtractor(object): if not src: continue - bitrate = int_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000) + bitrate = float_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000) filesize = int_or_none(video.get('size') or video.get('fileSize')) width = int_or_none(video.get('width')) height = int_or_none(video.get('height')) @@ -1086,13 +1135,21 @@ class InfoExtractor(object): 'width': width, 'height': height, }) + if transform_rtmp_url: + streamer, src = transform_rtmp_url(streamer, src) + formats[-1].update({ + 'url': streamer, + 'play_path': src, + }) continue src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src) if proto == 'm3u8' or src_ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - src_url, video_id, ext or 'mp4', m3u8_id='hls')) + m3u8_formats = self._extract_m3u8_formats( + src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False) + if m3u8_formats: + formats.extend(m3u8_formats) continue if src_ext == 'f4m': @@ -1104,10 +1161,12 @@ class InfoExtractor(object): } f4m_url += '&' if '?' in f4m_url else '?' f4m_url += compat_urllib_parse.urlencode(f4m_params) - formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds')) + f4m_formats = self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False) + if f4m_formats: + formats.extend(f4m_formats) continue - if src_url.startswith('http'): + if src_url.startswith('http') and self._is_valid_url(src, video_id): http_count += 1 formats.append({ 'url': src_url, @@ -1124,7 +1183,7 @@ class InfoExtractor(object): return formats - def _parse_smil_subtitles(self, smil, namespace=None): + def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'): subtitles = {} for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))): src = textstream.get('src') @@ -1133,15 +1192,63 @@ class InfoExtractor(object): ext = textstream.get('ext') or determine_ext(src) if not ext: type_ = textstream.get('type') - if type_ == 'text/srt': - ext = 'srt' - lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') + SUBTITLES_TYPES = { + 'text/vtt': 'vtt', + 'text/srt': 'srt', + 'application/smptett+xml': 'tt', + } + if type_ in SUBTITLES_TYPES: + ext = SUBTITLES_TYPES[type_] + lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang subtitles.setdefault(lang, []).append({ 'url': src, 'ext': ext, }) return subtitles + def _extract_xspf_playlist(self, playlist_url, playlist_id, fatal=True): + xspf = self._download_xml( + playlist_url, playlist_id, 'Downloading xpsf playlist', + 'Unable to download xspf manifest', fatal=fatal) + if xspf is False: + return [] + return self._parse_xspf(xspf, playlist_id) + + def _parse_xspf(self, playlist, playlist_id): + NS_MAP = { + 'xspf': 'http://xspf.org/ns/0/', + 's1': 'http://static.streamone.nl/player/ns/0', + } + + entries = [] + for track in playlist.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)): + title = xpath_text( + track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id) + description = xpath_text( + track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description') + thumbnail = xpath_text( + track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail') + duration = float_or_none( + xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000) + + formats = [{ + 'url': location.text, + 'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)), + 'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))), + 'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))), + } for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP))] + self._sort_formats(formats) + + entries.append({ + 'id': playlist_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'duration': duration, + 'formats': formats, + }) + return entries + def _live_title(self, name): """ Generate the title for a live video """ now = datetime.datetime.now() @@ -1178,7 +1285,7 @@ class InfoExtractor(object): def _get_cookies(self, url): """ Return a compat_cookies.SimpleCookie with the cookies for the url """ - req = compat_urllib_request.Request(url) + req = sanitized_Request(url) self._downloader.cookiejar.add_cookie_header(req) return compat_cookies.SimpleCookie(req.get_header('Cookie')) @@ -1220,6 +1327,23 @@ class InfoExtractor(object): def _get_subtitles(self, *args, **kwargs): raise NotImplementedError("This method must be implemented by subclasses") + @staticmethod + def _merge_subtitle_items(subtitle_list1, subtitle_list2): + """ Merge subtitle items for one language. Items with duplicated URLs + will be dropped. """ + list1_urls = set([item['url'] for item in subtitle_list1]) + ret = list(subtitle_list1) + ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls]) + return ret + + @classmethod + def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2): + """ Merge two subtitle dictionaries, language by language. """ + ret = dict(subtitle_dict1) + for lang in subtitle_dict2: + ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang]) + return ret + def extract_automatic_captions(self, *args, **kwargs): if (self._downloader.params.get('writeautomaticsub', False) or self._downloader.params.get('listsubtitles')):