X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Fextractor%2Fcommon.py;h=dbae75406233ae8d5d2db2acf2090e6522a91295;hb=9d5332518c51fb0df69b844f9258e61bd7ecd390;hp=16ae4b98ffe09c97f604981bf6c2ce9dc1e44e03;hpb=8b9848ac5678356757f67a412f7ed89a0f559be7;p=youtube-dl diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 16ae4b98f..dbae75406 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -15,6 +15,7 @@ import xml.etree.ElementTree from ..compat import ( compat_cookiejar, compat_cookies, + compat_getpass, compat_HTTPError, compat_http_client, compat_urllib_error, @@ -38,6 +39,7 @@ from ..utils import ( RegexNotFoundError, sanitize_filename, unescapeHTML, + unified_strdate, url_basename, xpath_text, xpath_with_ns, @@ -151,6 +153,7 @@ class InfoExtractor(object): description: Full video description. uploader: Full name of the video uploader. creator: The main artist who created the video. + release_date: The date (YYYYMMDD) when the video was released. timestamp: UNIX timestamp of the moment the video became available. upload_date: Video upload date (YYYYMMDD). If not explicitly set, calculated from timestamp. @@ -162,6 +165,7 @@ class InfoExtractor(object): with the "ext" entry and one of: * "data": The subtitles file contents * "url": A URL pointing to the subtitles file + "ext" will be calculated from URL if missing automatic_captions: Like 'subtitles', used by the YoutubeIE for automatically generated captions duration: Length of the video in seconds, as an integer. @@ -509,6 +513,18 @@ class InfoExtractor(object): """Report attempt to log in.""" self.to_screen('Logging in') + @staticmethod + def raise_login_required(msg='This video is only available for registered users'): + raise ExtractorError( + '%s. Use --username and --password or --netrc to provide account credentials.' % msg, + expected=True) + + @staticmethod + def raise_geo_restricted(msg='This video is not available from your location due to geo restriction'): + raise ExtractorError( + '%s. You might want to use --proxy to workaround.' % msg, + expected=True) + # Methods for following #608 @staticmethod def url_result(url, ie=None, video_id=None, video_title=None): @@ -610,7 +626,7 @@ class InfoExtractor(object): return (username, password) - def _get_tfa_info(self): + def _get_tfa_info(self, note='two-factor verification code'): """ Get the two-factor authentication info TODO - asking the user will be required for sms/phone verify @@ -624,7 +640,7 @@ class InfoExtractor(object): if downloader_params.get('twofactor', None) is not None: return downloader_params['twofactor'] - return None + return compat_getpass('Type %s and press [Return]: ' % note) # Helper functions for extracting OpenGraph info @staticmethod @@ -724,20 +740,23 @@ class InfoExtractor(object): @staticmethod def _hidden_inputs(html): - return dict([ - (input.group('name'), input.group('value')) for input in re.finditer( - r'''(?x) - ["\'])hidden(?P=q_hidden)\s+ - name=(?P["\'])(?P.+?)(?P=q_name)\s+ - (?:id=(?P["\']).+?(?P=q_id)\s+)? - value=(?P["\'])(?P.*?)(?P=q_value) - ''', html) - ]) + html = re.sub(r'', '', html) + hidden_inputs = {} + for input in re.findall(r'(?i)]+)>', html): + if not re.search(r'type=(["\'])(?:hidden|submit)\1', input): + continue + name = re.search(r'name=(["\'])(?P.+?)\1', input) + if not name: + continue + value = re.search(r'value=(["\'])(?P.*?)\1', input) + if not value: + continue + hidden_inputs[name.group('value')] = value.group('value') + return hidden_inputs def _form_hidden_inputs(self, form_id, html): form = self._search_regex( - r'(?s)]+?id=(["\'])%s\1[^>]*>(?P
.+?)
' % form_id, + r'(?is)]+?id=(["\'])%s\1[^>]*>(?P
.+?)
' % form_id, html, '%s form' % form_id, group='form') return self._hidden_inputs(form) @@ -852,13 +871,18 @@ class InfoExtractor(object): time.sleep(timeout) def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None, - transform_source=lambda s: fix_xml_ampersands(s).strip()): + transform_source=lambda s: fix_xml_ampersands(s).strip(), + fatal=True): manifest = self._download_xml( manifest_url, video_id, 'Downloading f4m manifest', 'Unable to download f4m manifest', # Some manifests may be malformed, e.g. prosiebensat1 generated manifests # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244) - transform_source=transform_source) + transform_source=transform_source, + fatal=fatal) + + if manifest is False: + return manifest formats = [] manifest_version = '1.0' @@ -879,7 +903,10 @@ class InfoExtractor(object): # may differ leading to inability to resolve the format by requested # bitrate in f4m downloader if determine_ext(manifest_url) == 'f4m': - formats.extend(self._extract_f4m_formats(manifest_url, video_id, preference, f4m_id)) + f4m_formats = self._extract_f4m_formats( + manifest_url, video_id, preference, f4m_id, fatal=fatal) + if f4m_formats: + formats.extend(f4m_formats) continue tbr = int_or_none(media_el.attrib.get('bitrate')) formats.append({ @@ -1027,6 +1054,7 @@ class InfoExtractor(object): video_id = os.path.splitext(url_basename(smil_url))[0] title = None description = None + upload_date = None for meta in smil.findall(self._xpath_ns('./head/meta', namespace)): name = meta.attrib.get('name') content = meta.attrib.get('content') @@ -1036,11 +1064,22 @@ class InfoExtractor(object): title = content elif not description and name in ('description', 'abstract'): description = content + elif not upload_date and name == 'date': + upload_date = unified_strdate(content) + + thumbnails = [{ + 'id': image.get('type'), + 'url': image.get('src'), + 'width': int_or_none(image.get('width')), + 'height': int_or_none(image.get('height')), + } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')] return { 'id': video_id, 'title': title or video_id, 'description': description, + 'upload_date': upload_date, + 'thumbnails': thumbnails, 'formats': formats, 'subtitles': subtitles, } @@ -1049,7 +1088,7 @@ class InfoExtractor(object): return self._search_regex( r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None) - def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None): + def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None): base = smil_url for meta in smil.findall(self._xpath_ns('./head/meta', namespace)): b = meta.get('base') or meta.get('httpBase') @@ -1067,7 +1106,7 @@ class InfoExtractor(object): if not src: continue - bitrate = int_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000) + bitrate = float_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000) filesize = int_or_none(video.get('size') or video.get('fileSize')) width = int_or_none(video.get('width')) height = int_or_none(video.get('height')) @@ -1088,13 +1127,21 @@ class InfoExtractor(object): 'width': width, 'height': height, }) + if transform_rtmp_url: + streamer, src = transform_rtmp_url(streamer, src) + formats[-1].update({ + 'url': streamer, + 'play_path': src, + }) continue src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src) if proto == 'm3u8' or src_ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - src_url, video_id, ext or 'mp4', m3u8_id='hls')) + m3u8_formats = self._extract_m3u8_formats( + src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False) + if m3u8_formats: + formats.extend(m3u8_formats) continue if src_ext == 'f4m': @@ -1106,10 +1153,12 @@ class InfoExtractor(object): } f4m_url += '&' if '?' in f4m_url else '?' f4m_url += compat_urllib_parse.urlencode(f4m_params) - formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds')) + f4m_formats = self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False) + if f4m_formats: + formats.extend(f4m_formats) continue - if src_url.startswith('http'): + if src_url.startswith('http') and self._is_valid_url(src, video_id): http_count += 1 formats.append({ 'url': src_url, @@ -1126,7 +1175,7 @@ class InfoExtractor(object): return formats - def _parse_smil_subtitles(self, smil, namespace=None): + def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'): subtitles = {} for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))): src = textstream.get('src') @@ -1135,9 +1184,14 @@ class InfoExtractor(object): ext = textstream.get('ext') or determine_ext(src) if not ext: type_ = textstream.get('type') - if type_ == 'text/srt': - ext = 'srt' - lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') + SUBTITLES_TYPES = { + 'text/vtt': 'vtt', + 'text/srt': 'srt', + 'application/smptett+xml': 'tt', + } + if type_ in SUBTITLES_TYPES: + ext = SUBTITLES_TYPES[type_] + lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang subtitles.setdefault(lang, []).append({ 'url': src, 'ext': ext, @@ -1265,6 +1319,23 @@ class InfoExtractor(object): def _get_subtitles(self, *args, **kwargs): raise NotImplementedError("This method must be implemented by subclasses") + @staticmethod + def _merge_subtitle_items(subtitle_list1, subtitle_list2): + """ Merge subtitle items for one language. Items with duplicated URLs + will be dropped. """ + list1_urls = set([item['url'] for item in subtitle_list1]) + ret = list(subtitle_list1) + ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls]) + return ret + + @classmethod + def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2): + """ Merge two subtitle dictionaries, language by language. """ + ret = dict(subtitle_dict1) + for lang in subtitle_dict2: + ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang]) + return ret + def extract_automatic_captions(self, *args, **kwargs): if (self._downloader.params.get('writeautomaticsub', False) or self._downloader.params.get('listsubtitles')):