X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Fextractor%2Fcommon.py;h=c51a3a07db693f23f9b53620353c3b4bc59957f0;hb=13b08034b53efdcf7055df92199a0f35cf1e172e;hp=f994953bc7b4b4bbbb9dd2bc073dbef0f2643bdd;hpb=e3c1266f492d710e2acbf0d80f44f7f805eb5187;p=youtube-dl diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index f994953bc..c51a3a07d 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -67,6 +67,7 @@ from ..utils import ( sanitized_Request, sanitize_filename, str_or_none, + strip_or_none, unescapeHTML, unified_strdate, unified_timestamp, @@ -219,7 +220,7 @@ class InfoExtractor(object): * "preference" (optional, int) - quality of the image * "width" (optional, int) * "height" (optional, int) - * "resolution" (optional, string "{width}x{height"}, + * "resolution" (optional, string "{width}x{height}", deprecated) * "filesize" (optional, int) thumbnail: Full URL to a video thumbnail image. @@ -1423,12 +1424,10 @@ class InfoExtractor(object): try: self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers) return True - except ExtractorError as e: - if isinstance(e.cause, compat_urllib_error.URLError): - self.to_screen( - '%s: %s URL is invalid, skipping' % (video_id, item)) - return False - raise + except ExtractorError: + self.to_screen( + '%s: %s URL is invalid, skipping' % (video_id, item)) + return False def http_scheme(self): """ Either "http:" or "https:", depending on the user's preferences """ @@ -1456,14 +1455,14 @@ class InfoExtractor(object): def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None, transform_source=lambda s: fix_xml_ampersands(s).strip(), - fatal=True, m3u8_id=None): + fatal=True, m3u8_id=None, data=None, headers={}, query={}): manifest = self._download_xml( manifest_url, video_id, 'Downloading f4m manifest', 'Unable to download f4m manifest', # Some manifests may be malformed, e.g. prosiebensat1 generated manifests # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244) transform_source=transform_source, - fatal=fatal) + fatal=fatal, data=data, headers=headers, query=query) if manifest is False: return [] @@ -1587,12 +1586,13 @@ class InfoExtractor(object): def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None, entry_protocol='m3u8', preference=None, m3u8_id=None, note=None, errnote=None, - fatal=True, live=False): + fatal=True, live=False, data=None, headers={}, + query={}): res = self._download_webpage_handle( m3u8_url, video_id, note=note or 'Downloading m3u8 information', errnote=errnote or 'Failed to download m3u8 information', - fatal=fatal) + fatal=fatal, data=data, headers=headers, query=query) if res is False: return [] @@ -1766,6 +1766,19 @@ class InfoExtractor(object): # the same GROUP-ID f['acodec'] = 'none' formats.append(f) + + # for DailyMotion + progressive_uri = last_stream_inf.get('PROGRESSIVE-URI') + if progressive_uri: + http_f = f.copy() + del http_f['manifest_url'] + http_f.update({ + 'format_id': f['format_id'].replace('hls-', 'http-'), + 'protocol': 'http', + 'url': progressive_uri, + }) + formats.append(http_f) + last_stream_inf = {} return formats @@ -2010,12 +2023,12 @@ class InfoExtractor(object): }) return entries - def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, formats_dict={}): + def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, formats_dict={}, data=None, headers={}, query={}): res = self._download_xml_handle( mpd_url, video_id, note=note or 'Downloading MPD manifest', errnote=errnote or 'Failed to download MPD manifest', - fatal=fatal) + fatal=fatal, data=data, headers=headers, query=query) if res is False: return [] mpd_doc, urlh = res @@ -2318,15 +2331,17 @@ class InfoExtractor(object): self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type) return formats - def _extract_ism_formats(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True): + def _extract_ism_formats(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}): res = self._download_xml_handle( ism_url, video_id, note=note or 'Downloading ISM manifest', errnote=errnote or 'Failed to download ISM manifest', - fatal=fatal) + fatal=fatal, data=data, headers=headers, query=query) if res is False: return [] ism_doc, urlh = res + if ism_doc is None: + return [] return self._parse_ism_formats(ism_doc, urlh.geturl(), ism_id) @@ -2480,7 +2495,7 @@ class InfoExtractor(object): 'subtitles': {}, } media_attributes = extract_attributes(media_tag) - src = media_attributes.get('src') + src = strip_or_none(media_attributes.get('src')) if src: _, formats = _media_formats(src, media_type) media_info['formats'].extend(formats) @@ -2490,7 +2505,7 @@ class InfoExtractor(object): s_attr = extract_attributes(source_tag) # data-video-src and data-src are non standard but seen # several times in the wild - src = dict_get(s_attr, ('src', 'data-video-src', 'data-src')) + src = strip_or_none(dict_get(s_attr, ('src', 'data-video-src', 'data-src'))) if not src: continue f = parse_content_type(s_attr.get('type')) @@ -2533,7 +2548,7 @@ class InfoExtractor(object): track_attributes = extract_attributes(track_tag) kind = track_attributes.get('kind') if not kind or kind in ('subtitles', 'captions'): - src = track_attributes.get('src') + src = strip_or_none(track_attributes.get('src')) if not src: continue lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label') @@ -2690,7 +2705,7 @@ class InfoExtractor(object): entry = { 'id': this_video_id, 'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')), - 'description': video_data.get('description'), + 'description': clean_html(video_data.get('description')), 'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))), 'timestamp': int_or_none(video_data.get('pubdate')), 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')), @@ -2818,15 +2833,19 @@ class InfoExtractor(object): return compat_cookies.SimpleCookie(req.get_header('Cookie')) def _apply_first_set_cookie_header(self, url_handle, cookie): - # Some sites (e.g. [1-3]) may serve two cookies under the same name - # in Set-Cookie header and expect the first (old) one to be set rather - # than second (new). However, as of RFC6265 the newer one cookie - # should be set into cookie store what actually happens. - # We will workaround this issue by resetting the cookie to - # the first one manually. - # 1. https://new.vk.com/ - # 2. https://github.com/ytdl-org/youtube-dl/issues/9841#issuecomment-227871201 - # 3. https://learning.oreilly.com/ + """ + Apply first Set-Cookie header instead of the last. Experimental. + + Some sites (e.g. [1-3]) may serve two cookies under the same name + in Set-Cookie header and expect the first (old) one to be set rather + than second (new). However, as of RFC6265 the newer one cookie + should be set into cookie store what actually happens. + We will workaround this issue by resetting the cookie to + the first one manually. + 1. https://new.vk.com/ + 2. https://github.com/ytdl-org/youtube-dl/issues/9841#issuecomment-227871201 + 3. https://learning.oreilly.com/ + """ for header, cookies in url_handle.headers.items(): if header.lower() != 'set-cookie': continue