X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Fextractor%2Fcommon.py;h=c3b0586a0da65ce42ce5f3ca6aee2e7119d3d98f;hb=22f5f5c6fcd2d7f0c9f1ff3019fe0b957b771f44;hp=f3fec160dcd0795840725227edcbfbfe3f3673c3;hpb=5e8e2fa51f416e227367211ab937dfea17f89f57;p=youtube-dl diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index f3fec160d..c3b0586a0 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -52,6 +52,7 @@ from ..utils import ( GeoUtils, int_or_none, js_to_json, + JSON_LD_RE, mimetype2ext, orderedSet, parse_codecs, @@ -68,6 +69,7 @@ from ..utils import ( update_url_query, urljoin, url_basename, + url_or_none, xpath_element, xpath_text, xpath_with_ns, @@ -210,6 +212,11 @@ class InfoExtractor(object): If not explicitly set, calculated from timestamp. uploader_id: Nickname or id of the video uploader. uploader_url: Full URL to a personal webpage of the video uploader. + channel: Full name of the channel the video is uploaded on. + Note that channel fields may or may not repeat uploader + fields. This depends on a particular extractor. + channel_id: Id of the channel. + channel_url: Full URL to a channel webpage. location: Physical location where the video was filmed. subtitles: The available subtitles as a dictionary in the format {tag: subformats}. "tag" is usually a language code, and @@ -599,6 +606,11 @@ class InfoExtractor(object): except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: if isinstance(err, compat_urllib_error.HTTPError): if self.__can_accept_status_code(err, expected_status): + # Retain reference to error to prevent file object from + # being closed before it can be read. Works around the + # effects of <https://bugs.python.org/issue15002> + # introduced in Python 3.4.1. 
+ err.fp._error = err return err.fp if errnote is False: @@ -1046,7 +1058,7 @@ class InfoExtractor(object): @staticmethod def _og_regexes(prop): content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))' - property_re = (r'(?:name|property)=(?:\'og:%(prop)s\'|"og:%(prop)s"|\s*og:%(prop)s\b)' + property_re = (r'(?:name|property)=(?:\'og[:-]%(prop)s\'|"og[:-]%(prop)s"|\s*og[:-]%(prop)s\b)' % {'prop': re.escape(prop)}) template = r'<meta[^>]+?%s[^>]+?%s' return [ @@ -1149,8 +1161,7 @@ class InfoExtractor(object): def _search_json_ld(self, html, video_id, expected_type=None, **kwargs): json_ld = self._search_regex( - r'(?s)<script[^>]+type=(["\'])application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>', - html, 'JSON-LD', group='json_ld', **kwargs) + JSON_LD_RE, html, 'JSON-LD', group='json_ld', **kwargs) default = kwargs.get('default', NO_DEFAULT) if not json_ld: return default if default is not NO_DEFAULT else {} @@ -1208,10 +1219,10 @@ class InfoExtractor(object): def extract_video_object(e): assert e['@type'] == 'VideoObject' info.update({ - 'url': e.get('contentUrl'), + 'url': url_or_none(e.get('contentUrl')), 'title': unescapeHTML(e.get('name')), 'description': unescapeHTML(e.get('description')), - 'thumbnail': e.get('thumbnailUrl') or e.get('thumbnailURL'), + 'thumbnail': url_or_none(e.get('thumbnailUrl') or e.get('thumbnailURL')), 'duration': parse_duration(e.get('duration')), 'timestamp': unified_timestamp(e.get('uploadDate')), 'filesize': float_or_none(e.get('contentSize')), @@ -1228,17 +1239,30 @@ class InfoExtractor(object): if expected_type is not None and expected_type != item_type: return info if item_type in ('TVEpisode', 'Episode'): + episode_name = unescapeHTML(e.get('name')) info.update({ - 'episode': unescapeHTML(e.get('name')), + 'episode': episode_name, 'episode_number': int_or_none(e.get('episodeNumber')), 'description': unescapeHTML(e.get('description')), }) + if not info.get('title') and episode_name: + info['title'] = episode_name part_of_season = 
e.get('partOfSeason') if isinstance(part_of_season, dict) and part_of_season.get('@type') in ('TVSeason', 'Season', 'CreativeWorkSeason'): - info['season_number'] = int_or_none(part_of_season.get('seasonNumber')) + info.update({ + 'season': unescapeHTML(part_of_season.get('name')), + 'season_number': int_or_none(part_of_season.get('seasonNumber')), + }) part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries') if isinstance(part_of_series, dict) and part_of_series.get('@type') in ('TVSeries', 'Series', 'CreativeWorkSeries'): info['series'] = unescapeHTML(part_of_series.get('name')) + elif item_type == 'Movie': + info.update({ + 'title': unescapeHTML(e.get('name')), + 'description': unescapeHTML(e.get('description')), + 'duration': parse_duration(e.get('duration')), + 'timestamp': unified_timestamp(e.get('dateCreated')), + }) elif item_type in ('Article', 'NewsArticle'): info.update({ 'timestamp': parse_iso8601(e.get('datePublished')), @@ -1575,6 +1599,7 @@ class InfoExtractor(object): # References: # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21 # 2. https://github.com/rg3/youtube-dl/issues/12211 + # 3. https://github.com/rg3/youtube-dl/issues/18923 # We should try extracting formats only from master playlists [1, 4.3.4], # i.e. playlists that describe available qualities. On the other hand @@ -1646,11 +1671,16 @@ class InfoExtractor(object): rendition = stream_group[0] return rendition.get('NAME') or stream_group_id + # parse EXT-X-MEDIA tags before EXT-X-STREAM-INF in order to have the + # chance to detect video only formats when EXT-X-STREAM-INF tags + # precede EXT-X-MEDIA tags in HLS manifest such as [3]. 
+ for line in m3u8_doc.splitlines(): + if line.startswith('#EXT-X-MEDIA:'): + extract_media(line) + for line in m3u8_doc.splitlines(): if line.startswith('#EXT-X-STREAM-INF:'): last_stream_inf = parse_m3u8_attributes(line) - elif line.startswith('#EXT-X-MEDIA:'): - extract_media(line) elif line.startswith('#') or not line.strip(): continue else: @@ -1701,9 +1731,9 @@ class InfoExtractor(object): # However, this is not always respected, for example, [2] # contains EXT-X-STREAM-INF tag which references AUDIO # rendition group but does not have CODECS and despite - # referencing audio group an audio group, it represents - # a complete (with audio and video) format. So, for such cases - # we will ignore references to rendition groups and treat them + # referencing an audio group it represents a complete + # (with audio and video) format. So, for such cases we will + # ignore references to rendition groups and treat them # as complete formats. if audio_group_id and codecs and f.get('vcodec') != 'none': audio_group = groups.get(audio_group_id) @@ -1859,9 +1889,7 @@ class InfoExtractor(object): 'height': height, }) formats.extend(m3u8_formats) - continue - - if src_ext == 'f4m': + elif src_ext == 'f4m': f4m_url = src_url if not f4m_params: f4m_params = { @@ -1871,9 +1899,13 @@ class InfoExtractor(object): f4m_url += '&' if '?' in f4m_url else '?' 
f4m_url += compat_urllib_parse_urlencode(f4m_params) formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False)) - continue - - if src_url.startswith('http') and self._is_valid_url(src, video_id): + elif src_ext == 'mpd': + formats.extend(self._extract_mpd_formats( + src_url, video_id, mpd_id='dash', fatal=False)) + elif re.search(r'\.ism/[Mm]anifest', src_url): + formats.extend(self._extract_ism_formats( + src_url, video_id, ism_id='mss', fatal=False)) + elif src_url.startswith('http') and self._is_valid_url(src, video_id): http_count += 1 formats.append({ 'url': src_url, @@ -1884,7 +1916,6 @@ class InfoExtractor(object): 'width': width, 'height': height, }) - continue return formats @@ -2106,7 +2137,21 @@ class InfoExtractor(object): representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info) def prepare_template(template_name, identifiers): - t = representation_ms_info[template_name] + tmpl = representation_ms_info[template_name] + # First off, % characters outside $...$ templates + # must be escaped by doubling for proper processing + # by % operator string formatting used further (see + # https://github.com/rg3/youtube-dl/issues/16867). + t = '' + in_template = False + for c in tmpl: + t += c + if c == '$': + in_template = not in_template + elif c == '%' and not in_template: + t += c + # Next, $...$ templates are translated to their + # %(...) 
counterparts to be used with % operator t = t.replace('$RepresentationID$', representation_id) t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t) t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t) @@ -2588,7 +2633,7 @@ class InfoExtractor(object): 'id': this_video_id, 'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')), 'description': video_data.get('description'), - 'thumbnail': self._proto_relative_url(video_data.get('image')), + 'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))), 'timestamp': int_or_none(video_data.get('pubdate')), 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')), 'subtitles': subtitles, @@ -2615,12 +2660,9 @@ class InfoExtractor(object): for source in jwplayer_sources_data: if not isinstance(source, dict): continue - source_url = self._proto_relative_url(source.get('file')) - if not source_url: - continue - if base_url: - source_url = compat_urlparse.urljoin(base_url, source_url) - if source_url in urls: + source_url = urljoin( + base_url, self._proto_relative_url(source.get('file'))) + if not source_url or source_url in urls: continue urls.append(source_url) source_type = source.get('type') or ''