[extractor/common] Skip malformed ISM manifest XMLs while extracting ISM formats...

[youtube-dl] / youtube_dl / extractor / common.py
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py

index 0889288f0e47fe785ee27584c2e84023de420083..c51a3a07db693f23f9b53620353c3b4bc59957f0 100644 (file)
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -67,6 +67,7 @@ from ..utils import (
      sanitized_Request,
      sanitize_filename,
      str_or_none,
+    strip_or_none,
      unescapeHTML,
      unified_strdate,
      unified_timestamp,
@@ -117,7 +118,7 @@ class InfoExtractor(object):
                                         unfragmented media)
                                       - URL of the MPD manifest or base URL
                                         representing the media if MPD manifest
-                                       is parsed froma string (in case of
+                                       is parsed from a string (in case of
                                         fragmented media)
                                     for MSS - URL of the ISM manifest.
                      * manifest_url
@@ -219,7 +220,7 @@ class InfoExtractor(object):
                          * "preference" (optional, int) - quality of the image
                          * "width" (optional, int)
                          * "height" (optional, int)
-                        * "resolution" (optional, string "{width}x{height"},
+                        * "resolution" (optional, string "{width}x{height}",
                                          deprecated)
                          * "filesize" (optional, int)
      thumbnail:      Full URL to a video thumbnail image.
@@ -542,11 +543,11 @@ class InfoExtractor(object):
              raise ExtractorError('An extractor error has occurred.', cause=e)
  
      def __maybe_fake_ip_and_retry(self, countries):
-        if (not self._downloader.params.get('geo_bypass_country', None) and
-                self._GEO_BYPASS and
-                self._downloader.params.get('geo_bypass', True) and
-                not self._x_forwarded_for_ip and
-                countries):
+        if (not self._downloader.params.get('geo_bypass_country', None)
+                and self._GEO_BYPASS
+                and self._downloader.params.get('geo_bypass', True)
+                and not self._x_forwarded_for_ip
+                and countries):
              country_code = random.choice(countries)
              self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
              if self._x_forwarded_for_ip:
@@ -682,8 +683,8 @@ class InfoExtractor(object):
  
      def __check_blocked(self, content):
          first_block = content[:512]
-        if ('<title>Access to this site is blocked</title>' in content and
-                'Websense' in first_block):
+        if ('<title>Access to this site is blocked</title>' in content
+                and 'Websense' in first_block):
              msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
              blocked_iframe = self._html_search_regex(
                  r'<iframe src="([^"]+)"', content,
@@ -701,8 +702,8 @@ class InfoExtractor(object):
              if block_msg:
                  msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
              raise ExtractorError(msg, expected=True)
-        if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content and
-                'blocklist.rkn.gov.ru' in content):
+        if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content
+                and 'blocklist.rkn.gov.ru' in content):
              raise ExtractorError(
                  'Access to this webpage has been blocked by decision of the Russian government. '
                  'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
@@ -1423,12 +1424,10 @@ class InfoExtractor(object):
          try:
              self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
              return True
-        except ExtractorError as e:
-            if isinstance(e.cause, compat_urllib_error.URLError):
-                self.to_screen(
-                    '%s: %s URL is invalid, skipping' % (video_id, item))
-                return False
-            raise
+        except ExtractorError:
+            self.to_screen(
+                '%s: %s URL is invalid, skipping' % (video_id, item))
+            return False
  
      def http_scheme(self):
          """ Either "http:" or "https:", depending on the user's preferences """
@@ -1456,14 +1455,14 @@ class InfoExtractor(object):
  
      def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
                               transform_source=lambda s: fix_xml_ampersands(s).strip(),
-                             fatal=True, m3u8_id=None):
+                             fatal=True, m3u8_id=None, data=None, headers={}, query={}):
          manifest = self._download_xml(
              manifest_url, video_id, 'Downloading f4m manifest',
              'Unable to download f4m manifest',
              # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
              # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244)
              transform_source=transform_source,
-            fatal=fatal)
+            fatal=fatal, data=data, headers=headers, query=query)
  
          if manifest is False:
              return []
@@ -1587,12 +1586,13 @@ class InfoExtractor(object):
      def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
                                entry_protocol='m3u8', preference=None,
                                m3u8_id=None, note=None, errnote=None,
-                              fatal=True, live=False):
+                              fatal=True, live=False, data=None, headers={},
+                              query={}):
          res = self._download_webpage_handle(
              m3u8_url, video_id,
              note=note or 'Downloading m3u8 information',
              errnote=errnote or 'Failed to download m3u8 information',
-            fatal=fatal)
+            fatal=fatal, data=data, headers=headers, query=query)
  
          if res is False:
              return []
@@ -1709,8 +1709,8 @@ class InfoExtractor(object):
                  continue
              else:
                  tbr = float_or_none(
-                    last_stream_inf.get('AVERAGE-BANDWIDTH') or
-                    last_stream_inf.get('BANDWIDTH'), scale=1000)
+                    last_stream_inf.get('AVERAGE-BANDWIDTH')
+                    or last_stream_inf.get('BANDWIDTH'), scale=1000)
                  format_id = []
                  if m3u8_id:
                      format_id.append(m3u8_id)
@@ -1766,6 +1766,19 @@ class InfoExtractor(object):
                          # the same GROUP-ID
                          f['acodec'] = 'none'
                  formats.append(f)
+
+                # for DailyMotion
+                progressive_uri = last_stream_inf.get('PROGRESSIVE-URI')
+                if progressive_uri:
+                    http_f = f.copy()
+                    del http_f['manifest_url']
+                    http_f.update({
+                        'format_id': f['format_id'].replace('hls-', 'http-'),
+                        'protocol': 'http',
+                        'url': progressive_uri,
+                    })
+                    formats.append(http_f)
+
                  last_stream_inf = {}
          return formats
  
@@ -2010,15 +2023,17 @@ class InfoExtractor(object):
              })
          return entries
  
-    def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, formats_dict={}):
+    def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, formats_dict={}, data=None, headers={}, query={}):
          res = self._download_xml_handle(
              mpd_url, video_id,
              note=note or 'Downloading MPD manifest',
              errnote=errnote or 'Failed to download MPD manifest',
-            fatal=fatal)
+            fatal=fatal, data=data, headers=headers, query=query)
          if res is False:
              return []
          mpd_doc, urlh = res
+        if mpd_doc is None:
+            return []
          mpd_base_url = base_url(urlh.geturl())
  
          return self._parse_mpd_formats(
@@ -2316,15 +2331,17 @@ class InfoExtractor(object):
                          self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
          return formats
  
-    def _extract_ism_formats(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True):
+    def _extract_ism_formats(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
          res = self._download_xml_handle(
              ism_url, video_id,
              note=note or 'Downloading ISM manifest',
              errnote=errnote or 'Failed to download ISM manifest',
-            fatal=fatal)
+            fatal=fatal, data=data, headers=headers, query=query)
          if res is False:
              return []
          ism_doc, urlh = res
+        if ism_doc is None:
+            return []
  
          return self._parse_ism_formats(ism_doc, urlh.geturl(), ism_id)
  
@@ -2478,7 +2495,7 @@ class InfoExtractor(object):
                  'subtitles': {},
              }
              media_attributes = extract_attributes(media_tag)
-            src = media_attributes.get('src')
+            src = strip_or_none(media_attributes.get('src'))
              if src:
                  _, formats = _media_formats(src, media_type)
                  media_info['formats'].extend(formats)
@@ -2488,7 +2505,7 @@ class InfoExtractor(object):
                      s_attr = extract_attributes(source_tag)
                      # data-video-src and data-src are non standard but seen
                      # several times in the wild
-                    src = dict_get(s_attr, ('src', 'data-video-src', 'data-src'))
+                    src = strip_or_none(dict_get(s_attr, ('src', 'data-video-src', 'data-src')))
                      if not src:
                          continue
                      f = parse_content_type(s_attr.get('type'))
@@ -2502,8 +2519,8 @@ class InfoExtractor(object):
                              if str_or_none(s_attr.get(lbl))
                          ]
                          width = int_or_none(s_attr.get('width'))
-                        height = (int_or_none(s_attr.get('height')) or
-                                  int_or_none(s_attr.get('res')))
+                        height = (int_or_none(s_attr.get('height'))
+                                  or int_or_none(s_attr.get('res')))
                          if not width or not height:
                              for lbl in labels:
                                  resolution = parse_resolution(lbl)
@@ -2531,7 +2548,7 @@ class InfoExtractor(object):
                      track_attributes = extract_attributes(track_tag)
                      kind = track_attributes.get('kind')
                      if not kind or kind in ('subtitles', 'captions'):
-                        src = track_attributes.get('src')
+                        src = strip_or_none(track_attributes.get('src'))
                          if not src:
                              continue
                          lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
@@ -2688,7 +2705,7 @@ class InfoExtractor(object):
              entry = {
                  'id': this_video_id,
                  'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
-                'description': video_data.get('description'),
+                'description': clean_html(video_data.get('description')),
                  'thumbnail': urljoin(base_url, self._proto_relative_url(video_data.get('image'))),
                  'timestamp': int_or_none(video_data.get('pubdate')),
                  'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
@@ -2815,6 +2832,33 @@ class InfoExtractor(object):
          self._downloader.cookiejar.add_cookie_header(req)
          return compat_cookies.SimpleCookie(req.get_header('Cookie'))
  
+    def _apply_first_set_cookie_header(self, url_handle, cookie):
+        """
+        Apply first Set-Cookie header instead of the last. Experimental.
+
+        Some sites (e.g. [1-3]) may serve two cookies under the same name
+        in the Set-Cookie header and expect the first (old) one to be set
+        rather than the second (new) one. However, per RFC 6265 the newer
+        cookie is the one that should be stored, which is what actually
+        happens. We work around this issue by manually resetting the cookie
+        to the first one.
+        1. https://new.vk.com/
+        2. https://github.com/ytdl-org/youtube-dl/issues/9841#issuecomment-227871201
+        3. https://learning.oreilly.com/
+        """
+        for header, cookies in url_handle.headers.items():
+            if header.lower() != 'set-cookie':
+                continue
+            if sys.version_info[0] >= 3:
+                cookies = cookies.encode('iso-8859-1')
+            cookies = cookies.decode('utf-8')
+            cookie_value = re.search(
+                r'%s=(.+?);.*?\b[Dd]omain=(.+?)(?:[,;]|$)' % cookie, cookies)
+            if cookie_value:
+                value, domain = cookie_value.groups()
+                self._set_cookie(domain, cookie, value)
+                break
+
      def get_testcases(self, include_onlymatching=False):
          t = getattr(self, '_TEST', None)
          if t:
@@ -2845,8 +2889,8 @@ class InfoExtractor(object):
          return not any_restricted
  
      def extract_subtitles(self, *args, **kwargs):
-        if (self._downloader.params.get('writesubtitles', False) or
-                self._downloader.params.get('listsubtitles')):
+        if (self._downloader.params.get('writesubtitles', False)
+                or self._downloader.params.get('listsubtitles')):
              return self._get_subtitles(*args, **kwargs)
          return {}
  
@@ -2871,8 +2915,8 @@ class InfoExtractor(object):
          return ret
  
      def extract_automatic_captions(self, *args, **kwargs):
-        if (self._downloader.params.get('writeautomaticsub', False) or
-                self._downloader.params.get('listsubtitles')):
+        if (self._downloader.params.get('writeautomaticsub', False)
+                or self._downloader.params.get('listsubtitles')):
              return self._get_automatic_captions(*args, **kwargs)
          return {}
  
@@ -2880,9 +2924,9 @@ class InfoExtractor(object):
          raise NotImplementedError('This method must be implemented by subclasses')
  
      def mark_watched(self, *args, **kwargs):
-        if (self._downloader.params.get('mark_watched', False) and
-                (self._get_login_info()[0] is not None or
-                    self._downloader.params.get('cookiefile') is not None)):
+        if (self._downloader.params.get('mark_watched', False)
+                and (self._get_login_info()[0] is not None
+                     or self._downloader.params.get('cookiefile') is not None)):
              self._mark_watched(*args, **kwargs)
  
      def _mark_watched(self, *args, **kwargs):