Merge branch 'akamai_pv' of https://github.com/remitamine/youtube-dl into remitamine...

[youtube-dl] / youtube_dl / extractor / common.py
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py

index aaca25a12a872eabfe3f43734e0599e7e3d4d78e..a285ee7d898e0777005ff625b5fcc44f1013d683 100644 (file)
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -21,9 +21,11 @@ from ..compat import (
      compat_os_name,
      compat_str,
      compat_urllib_error,
-    compat_urllib_parse,
+    compat_urllib_parse_urlencode,
+    compat_urllib_request,
      compat_urlparse,
  )
+from ..downloader.f4m import remove_encrypted_media
  from ..utils import (
      NO_DEFAULT,
      age_restricted,
@@ -48,6 +50,7 @@ from ..utils import (
      determine_protocol,
      parse_duration,
      mimetype2ext,
+    update_Request,
      update_url_query,
  )
  
@@ -229,6 +232,24 @@ class InfoExtractor(object):
      episode_number: Number of the video episode within a season, as an integer.
      episode_id:     Id of the video episode, as a unicode string.
  
+    The following fields should only be used when the media is a track or a part of
+    a music album:
+
+    track:          Title of the track.
+    track_number:   Number of the track within an album or a disc, as an integer.
+    track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
+                    as a unicode string.
+    artist:         Artist(s) of the track.
+    genre:          Genre(s) of the track.
+    album:          Title of the album the track belongs to.
+    album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
+    album_artist:   List of all artists appeared on the album (e.g.
+                    "Ash Borer / Fell Voices" or "Various Artists", useful for splits
+                    and compilations).
+    disc_number:    Number of the disc or other physical medium the track belongs to,
+                    as an integer.
+    release_year:   Year (YYYY) when the album was released.
+
      Unless mentioned otherwise, the fields should be Unicode strings.
  
      Unless mentioned otherwise, None is equivalent to absence of information.
@@ -346,7 +367,7 @@ class InfoExtractor(object):
      def IE_NAME(self):
          return compat_str(type(self).__name__[:-2])
  
-    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers=None, query=None):
+    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
          """ Returns the response handle """
          if note is None:
              self.report_download_webpage(video_id)
@@ -355,12 +376,14 @@ class InfoExtractor(object):
                  self.to_screen('%s' % (note,))
              else:
                  self.to_screen('%s: %s' % (video_id, note))
-        # data, headers and query params will be ignored for `Request` objects
-        if isinstance(url_or_request, compat_str):
+        if isinstance(url_or_request, compat_urllib_request.Request):
+            url_or_request = update_Request(
+                url_or_request, data=data, headers=headers, query=query)
+        else:
              if query:
                  url_or_request = update_url_query(url_or_request, query)
-            if data or headers:
-                url_or_request = sanitized_Request(url_or_request, data, headers or {})
+            if data is not None or headers:
+                url_or_request = sanitized_Request(url_or_request, data, headers)
          try:
              return self._downloader.urlopen(url_or_request)
          except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
@@ -376,7 +399,7 @@ class InfoExtractor(object):
                  self._downloader.report_warning(errmsg)
                  return False
  
-    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers=None, query=None):
+    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}):
          """ Returns a tuple (page content as string, URL handle) """
          # Strip hashes from the URL (#1038)
          if isinstance(url_or_request, (compat_str, str)):
@@ -469,7 +492,7 @@ class InfoExtractor(object):
  
          return content
  
-    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None, data=None, headers=None, query=None):
+    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None, data=None, headers={}, query={}):
          """ Returns the data of the page as a string """
          success = False
          try_count = 0
@@ -490,7 +513,7 @@ class InfoExtractor(object):
  
      def _download_xml(self, url_or_request, video_id,
                        note='Downloading XML', errnote='Unable to download XML',
-                      transform_source=None, fatal=True, encoding=None, data=None, headers=None, query=None):
+                      transform_source=None, fatal=True, encoding=None, data=None, headers={}, query={}):
          """Return the xml as an xml.etree.ElementTree.Element"""
          xml_string = self._download_webpage(
              url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding, data=data, headers=headers, query=query)
@@ -504,7 +527,7 @@ class InfoExtractor(object):
                         note='Downloading JSON metadata',
                         errnote='Unable to download JSON metadata',
                         transform_source=None,
-                       fatal=True, encoding=None, data=None, headers=None, query=None):
+                       fatal=True, encoding=None, data=None, headers={}, query={}):
          json_string = self._download_webpage(
              url_or_request, video_id, note, errnote, fatal=fatal,
              encoding=encoding, data=data, headers=headers, query=query)
@@ -819,7 +842,7 @@ class InfoExtractor(object):
          for input in re.findall(r'(?i)<input([^>]+)>', html):
              if not re.search(r'type=(["\'])(?:hidden|submit)\1', input):
                  continue
-            name = re.search(r'name=(["\'])(?P<value>.+?)\1', input)
+            name = re.search(r'(?:name|id)=(["\'])(?P<value>.+?)\1', input)
              if not name:
                  continue
              value = re.search(r'value=(["\'])(?P<value>.*?)\1', input)
@@ -862,6 +885,7 @@ class InfoExtractor(object):
              proto_preference = 0 if determine_protocol(f) in ['http', 'https'] else -0.1
  
              if f.get('vcodec') == 'none':  # audio only
+                preference -= 50
                  if self._downloader.params.get('prefer_free_formats'):
                      ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
                  else:
@@ -872,6 +896,8 @@ class InfoExtractor(object):
                  except ValueError:
                      audio_ext_preference = -1
              else:
+                if f.get('acodec') == 'none':  # video only
+                    preference -= 40
                  if self._downloader.params.get('prefer_free_formats'):
                      ORDER = ['flv', 'mp4', 'webm']
                  else:
@@ -973,12 +999,31 @@ class InfoExtractor(object):
          if manifest is False:
              return []
  
+        return self._parse_f4m_formats(
+            manifest, manifest_url, video_id, preference=preference, f4m_id=f4m_id,
+            transform_source=transform_source, fatal=fatal)
+
+    def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, f4m_id=None,
+                           transform_source=lambda s: fix_xml_ampersands(s).strip(),
+                           fatal=True):
+        # currently youtube-dl cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
+        akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
+        if akamai_pv is not None and ';' in akamai_pv.text:
+            playerVerificationChallenge = akamai_pv.text.split(';')[0]
+            if playerVerificationChallenge.strip() != '':
+                return []
+
          formats = []
          manifest_version = '1.0'
          media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
          if not media_nodes:
              manifest_version = '2.0'
              media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
+        # Remove unsupported DRM protected media from final formats
+        # rendition (see https://github.com/rg3/youtube-dl/issues/8573).
+        media_nodes = remove_encrypted_media(media_nodes)
+        if not media_nodes:
+            return formats
          base_url = xpath_text(
              manifest, ['{http://ns.adobe.com/f4m/1.0}baseURL', '{http://ns.adobe.com/f4m/2.0}baseURL'],
              'base URL', default=None)
@@ -998,7 +1043,8 @@ class InfoExtractor(object):
                  # bitrate in f4m downloader
                  if determine_ext(manifest_url) == 'f4m':
                      formats.extend(self._extract_f4m_formats(
-                        manifest_url, video_id, preference, f4m_id, fatal=fatal))
+                        manifest_url, video_id, preference=preference, f4m_id=f4m_id,
+                        transform_source=transform_source, fatal=fatal))
                      continue
              tbr = int_or_none(media_el.attrib.get('bitrate'))
              formats.append({
@@ -1010,8 +1056,6 @@ class InfoExtractor(object):
                  'height': int_or_none(media_el.attrib.get('height')),
                  'preference': preference,
              })
-        self._sort_formats(formats)
-
          return formats
  
      def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
@@ -1132,7 +1176,6 @@ class InfoExtractor(object):
                      last_media = None
                  formats.append(f)
                  last_info = {}
-        self._sort_formats(formats)
          return formats
  
      @staticmethod
@@ -1147,8 +1190,8 @@ class InfoExtractor(object):
                  out.append('{%s}%s' % (namespace, c))
          return '/'.join(out)
  
-    def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None):
-        smil = self._download_smil(smil_url, video_id, fatal=fatal)
+    def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
+        smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
  
          if smil is False:
              assert not fatal
@@ -1165,10 +1208,10 @@ class InfoExtractor(object):
              return {}
          return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
  
-    def _download_smil(self, smil_url, video_id, fatal=True):
+    def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
          return self._download_xml(
              smil_url, video_id, 'Downloading SMIL file',
-            'Unable to download SMIL file', fatal=fatal)
+            'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
  
      def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
          namespace = self._parse_smil_namespace(smil)
@@ -1289,7 +1332,7 @@ class InfoExtractor(object):
                          'plugin': 'flowplayer-3.2.0.1',
                      }
                  f4m_url += '&' if '?' in f4m_url else '?'
-                f4m_url += compat_urllib_parse.urlencode(f4m_params)
+                f4m_url += compat_urllib_parse_urlencode(f4m_params)
                  formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
                  continue
  
@@ -1306,8 +1349,6 @@ class InfoExtractor(object):
                  })
                  continue
  
-        self._sort_formats(formats)
-
          return formats
  
      def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
@@ -1318,7 +1359,7 @@ class InfoExtractor(object):
              if not src or src in urls:
                  continue
              urls.append(src)
-            ext = textstream.get('ext') or determine_ext(src) or mimetype2ext(textstream.get('type'))
+            ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
              lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
              subtitles.setdefault(lang, []).append({
                  'url': src,
@@ -1498,9 +1539,16 @@ class InfoExtractor(object):
                                  representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
                              media_template = representation_ms_info['media_template']
                              media_template = media_template.replace('$RepresentationID$', representation_id)
-                            media_template = re.sub(r'\$(Number|Bandwidth)(?:%(0\d+)d)?\$', r'%(\1)\2d', media_template)
+                            media_template = re.sub(r'\$(Number|Bandwidth)\$', r'%(\1)d', media_template)
+                            media_template = re.sub(r'\$(Number|Bandwidth)%([^$]+)\$', r'%(\1)\2', media_template)
                              media_template.replace('$$', '$')
-                            representation_ms_info['segment_urls'] = [media_template % {'Number': segment_number, 'Bandwidth': representation_attrib.get('bandwidth')} for segment_number in range(representation_ms_info['start_number'], representation_ms_info['total_number'] + representation_ms_info['start_number'])]
+                            representation_ms_info['segment_urls'] = [
+                                media_template % {
+                                    'Number': segment_number,
+                                    'Bandwidth': representation_attrib.get('bandwidth')}
+                                for segment_number in range(
+                                    representation_ms_info['start_number'],
+                                    representation_ms_info['total_number'] + representation_ms_info['start_number'])]
                          if 'segment_urls' in representation_ms_info:
                              f.update({
                                  'segment_urls': representation_ms_info['segment_urls'],
@@ -1525,7 +1573,6 @@ class InfoExtractor(object):
                              existing_format.update(f)
                      else:
                          self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
-        self._sort_formats(formats)
          return formats
  
      def _live_title(self, name):