Merge pull request #9492 from jwilk/teamcoco

[youtube-dl] / youtube_dl / extractor / common.py
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py

index 85ac0400c1e01e160828de9b6805437631cd61aa..0843d89af71f7b68f6b650c01a3f8edcffdc78b3 100644 (file)
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -163,7 +163,7 @@ class InfoExtractor(object):
      description:    Full video description.
      uploader:       Full name of the video uploader.
      license:        License name the video is licensed under.
      description:    Full video description.
      uploader:       Full name of the video uploader.
      license:        License name the video is licensed under.
-    creator:        The main artist who created the video.
+    creator:        The creator of the video.
      release_date:   The date (YYYYMMDD) when the video was released.
      timestamp:      UNIX timestamp of the moment the video became available.
      upload_date:    Video upload date (YYYYMMDD).
      release_date:   The date (YYYYMMDD) when the video was released.
      timestamp:      UNIX timestamp of the moment the video became available.
      upload_date:    Video upload date (YYYYMMDD).
@@ -232,6 +232,24 @@ class InfoExtractor(object):
      episode_number: Number of the video episode within a season, as an integer.
      episode_id:     Id of the video episode, as a unicode string.
  
      episode_number: Number of the video episode within a season, as an integer.
      episode_id:     Id of the video episode, as a unicode string.
  
+    The following fields should only be used when the media is a track or a part of
+    a music album:
+
+    track:          Title of the track.
+    track_number:   Number of the track within an album or a disc, as an integer.
+    track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
+                    as a unicode string.
+    artist:         Artist(s) of the track.
+    genre:          Genre(s) of the track.
+    album:          Title of the album the track belongs to.
+    album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation").
+    album_artist:   List of all artists that appeared on the album (e.g.
+                    "Ash Borer / Fell Voices" or "Various Artists", useful for splits
+                    and compilations).
+    disc_number:    Number of the disc or other physical medium the track belongs to,
+                    as an integer.
+    release_year:   Year (YYYY) when the album was released.
+
      Unless mentioned otherwise, the fields should be Unicode strings.
  
      Unless mentioned otherwise, None is equivalent to absence of information.
      Unless mentioned otherwise, the fields should be Unicode strings.
  
      Unless mentioned otherwise, None is equivalent to absence of information.
@@ -358,14 +376,13 @@ class InfoExtractor(object):
                  self.to_screen('%s' % (note,))
              else:
                  self.to_screen('%s: %s' % (video_id, note))
                  self.to_screen('%s' % (note,))
              else:
                  self.to_screen('%s: %s' % (video_id, note))
-        # data, headers and query params will be ignored for `Request` objects
          if isinstance(url_or_request, compat_urllib_request.Request):
              url_or_request = update_Request(
                  url_or_request, data=data, headers=headers, query=query)
          else:
              if query:
                  url_or_request = update_url_query(url_or_request, query)
          if isinstance(url_or_request, compat_urllib_request.Request):
              url_or_request = update_Request(
                  url_or_request, data=data, headers=headers, query=query)
          else:
              if query:
                  url_or_request = update_url_query(url_or_request, query)
-            if data or headers:
+            if data is not None or headers:
                  url_or_request = sanitized_Request(url_or_request, data, headers)
          try:
              return self._downloader.urlopen(url_or_request)
                  url_or_request = sanitized_Request(url_or_request, data, headers)
          try:
              return self._downloader.urlopen(url_or_request)
@@ -825,7 +842,7 @@ class InfoExtractor(object):
          for input in re.findall(r'(?i)<input([^>]+)>', html):
              if not re.search(r'type=(["\'])(?:hidden|submit)\1', input):
                  continue
          for input in re.findall(r'(?i)<input([^>]+)>', html):
              if not re.search(r'type=(["\'])(?:hidden|submit)\1', input):
                  continue
-            name = re.search(r'name=(["\'])(?P<value>.+?)\1', input)
+            name = re.search(r'(?:name|id)=(["\'])(?P<value>.+?)\1', input)
              if not name:
                  continue
              value = re.search(r'value=(["\'])(?P<value>.*?)\1', input)
              if not name:
                  continue
              value = re.search(r'value=(["\'])(?P<value>.*?)\1', input)
@@ -989,6 +1006,13 @@ class InfoExtractor(object):
      def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, f4m_id=None,
                             transform_source=lambda s: fix_xml_ampersands(s).strip(),
                             fatal=True):
      def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, f4m_id=None,
                             transform_source=lambda s: fix_xml_ampersands(s).strip(),
                             fatal=True):
+        # youtube-dl cannot currently decode the playerVerificationChallenge because
+        # Akamai uses Adobe Alchemy, so return no formats when a challenge is present
+        akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
+        if akamai_pv is not None and ';' in akamai_pv.text:
+            playerVerificationChallenge = akamai_pv.text.split(';')[0]
+            if playerVerificationChallenge.strip() != '':
+                return []
+
          formats = []
          manifest_version = '1.0'
          media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
          formats = []
          manifest_version = '1.0'
          media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
@@ -1037,7 +1061,7 @@ class InfoExtractor(object):
      def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
                                entry_protocol='m3u8', preference=None,
                                m3u8_id=None, note=None, errnote=None,
      def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
                                entry_protocol='m3u8', preference=None,
                                m3u8_id=None, note=None, errnote=None,
-                              fatal=True):
+                              fatal=True, live=False):
  
          formats = [{
              'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
  
          formats = [{
              'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
@@ -1115,7 +1139,11 @@ class InfoExtractor(object):
                  if m3u8_id:
                      format_id.append(m3u8_id)
                  last_media_name = last_media.get('NAME') if last_media and last_media.get('TYPE') != 'SUBTITLES' else None
                  if m3u8_id:
                      format_id.append(m3u8_id)
                  last_media_name = last_media.get('NAME') if last_media and last_media.get('TYPE') != 'SUBTITLES' else None
-                format_id.append(last_media_name if last_media_name else '%d' % (tbr if tbr else len(formats)))
+                # The bandwidth of a live stream may change over time, which
+                # would make a bandwidth-derived format_id unpredictable, so
+                # keep the provided format_id intact for live streams.
+                if not live:
+                    format_id.append(last_media_name if last_media_name else '%d' % (tbr if tbr else len(formats)))
                  f = {
                      'format_id': '-'.join(format_id),
                      'url': format_url(line.strip()),
                  f = {
                      'format_id': '-'.join(format_id),
                      'url': format_url(line.strip()),
@@ -1335,7 +1363,7 @@ class InfoExtractor(object):
              if not src or src in urls:
                  continue
              urls.append(src)
              if not src or src in urls:
                  continue
              urls.append(src)
-            ext = textstream.get('ext') or determine_ext(src) or mimetype2ext(textstream.get('type'))
+            ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
              lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
              subtitles.setdefault(lang, []).append({
                  'url': src,
              lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
              subtitles.setdefault(lang, []).append({
                  'url': src,
@@ -1515,9 +1543,16 @@ class InfoExtractor(object):
                                  representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
                              media_template = representation_ms_info['media_template']
                              media_template = media_template.replace('$RepresentationID$', representation_id)
                                  representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
                              media_template = representation_ms_info['media_template']
                              media_template = media_template.replace('$RepresentationID$', representation_id)
-                            media_template = re.sub(r'\$(Number|Bandwidth)(?:%(0\d+)d)?\$', r'%(\1)\2d', media_template)
+                            media_template = re.sub(r'\$(Number|Bandwidth)\$', r'%(\1)d', media_template)
+                            media_template = re.sub(r'\$(Number|Bandwidth)%([^$]+)\$', r'%(\1)\2', media_template)
                              media_template.replace('$$', '$')
                              media_template.replace('$$', '$')
-                            representation_ms_info['segment_urls'] = [media_template % {'Number': segment_number, 'Bandwidth': representation_attrib.get('bandwidth')} for segment_number in range(representation_ms_info['start_number'], representation_ms_info['total_number'] + representation_ms_info['start_number'])]
+                            representation_ms_info['segment_urls'] = [
+                                media_template % {
+                                    'Number': segment_number,
+                                    'Bandwidth': representation_attrib.get('bandwidth')}
+                                for segment_number in range(
+                                    representation_ms_info['start_number'],
+                                    representation_ms_info['total_number'] + representation_ms_info['start_number'])]
                          if 'segment_urls' in representation_ms_info:
                              f.update({
                                  'segment_urls': representation_ms_info['segment_urls'],
                          if 'segment_urls' in representation_ms_info:
                              f.update({
                                  'segment_urls': representation_ms_info['segment_urls'],