X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Fextractor%2Fcommon.py;h=a9497851cd5ac122c9ba5488e70b512d520d0a3a;hb=51e9094f4a769187b11621c6477b19ad0e2418f9;hp=9c0421de74288088133398828c6f652b71299caf;hpb=6a3828fddd6f6cec28ec8676bcc3918d583db77c;p=youtube-dl diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 9c0421de7..a9497851c 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -1186,6 +1186,7 @@ class InfoExtractor(object): http_count = 0 m3u8_count = 0 + src_urls = [] videos = smil.findall(self._xpath_ns('.//video', namespace)) for video in videos: src = video.get('src') @@ -1222,6 +1223,9 @@ class InfoExtractor(object): continue src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src) + if src_url in src_urls: + continue + src_urls.append(src_url) if proto == 'm3u8' or src_ext == 'm3u8': m3u8_formats = self._extract_m3u8_formats( @@ -1267,11 +1271,13 @@ class InfoExtractor(object): return formats def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'): + urls = [] subtitles = {} for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))): src = textstream.get('src') - if not src: + if not src or src in urls: continue + urls.append(src) ext = textstream.get('ext') or determine_ext(src) if not ext: type_ = textstream.get('type') @@ -1434,6 +1440,8 @@ class InfoExtractor(object): base_url = mpd_base_url + base_url representation_id = representation_attrib.get('id') lang = representation_attrib.get('lang') + url_el = representation.find(_add_ns('BaseURL')) + filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None) f = { 'format_id': mpd_id or representation_id, 'url': base_url, @@ -1446,6 +1454,7 @@ class InfoExtractor(object): 'acodec': 'none' if content_type == 'video' else representation_attrib.get('codecs'), 'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None, 'format_note': 'DASH %s' % content_type, + 'filesize': filesize, } representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info) if 'segment_urls' not in representation_ms_info and 'media_template' in representation_ms_info: