[common] fix dash codec information for mixed videos and fragment url construction...

[youtube-dl] / youtube_dl / extractor / common.py
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py

index 140ccf23422c88b9fa4524f26b7fca5b06e1bda1..58da2702526be72dc9c9415919d97e34375f064a 100644 (file)
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -59,6 +59,7 @@ from ..utils import (
      parse_m3u8_attributes,
      extract_attributes,
      parse_codecs,
+    urljoin,
  )
  
  
@@ -886,7 +887,7 @@ class InfoExtractor(object):
                          'url': e.get('contentUrl'),
                          'title': unescapeHTML(e.get('name')),
                          'description': unescapeHTML(e.get('description')),
-                        'thumbnail': e.get('thumbnailUrl'),
+                        'thumbnail': e.get('thumbnailUrl') or e.get('thumbnailURL'),
                          'duration': parse_duration(e.get('duration')),
                          'timestamp': unified_timestamp(e.get('uploadDate')),
                          'filesize': float_or_none(e.get('contentSize')),
@@ -1224,6 +1225,7 @@ class InfoExtractor(object):
                  'protocol': entry_protocol,
                  'preference': preference,
              }]
+        audio_groups = set()
          last_info = {}
          last_media = {}
          for line in m3u8_doc.splitlines():
@@ -1239,15 +1241,18 @@ class InfoExtractor(object):
                          for v in (media.get('GROUP-ID'), media.get('NAME')):
                              if v:
                                  format_id.append(v)
-                        formats.append({
+                        f = {
                              'format_id': '-'.join(format_id),
                              'url': format_url(media_url),
                              'language': media.get('LANGUAGE'),
-                            'vcodec': 'none' if media_type == 'AUDIO' else None,
                              'ext': ext,
                              'protocol': entry_protocol,
                              'preference': preference,
-                        })
+                        }
+                        if media_type == 'AUDIO':
+                            f['vcodec'] = 'none'
+                            audio_groups.add(media['GROUP-ID'])
+                        formats.append(f)
                      else:
                          # When there is no URI in EXT-X-MEDIA let this tag's
                          # data be used by regular URI lines below
@@ -1280,9 +1285,10 @@ class InfoExtractor(object):
                  }
                  resolution = last_info.get('RESOLUTION')
                  if resolution:
-                    width_str, height_str = resolution.split('x')
-                    f['width'] = int(width_str)
-                    f['height'] = int(height_str)
+                    mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
+                    if mobj:
+                        f['width'] = int(mobj.group('width'))
+                        f['height'] = int(mobj.group('height'))
                  # Unified Streaming Platform
                  mobj = re.search(
                      r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
@@ -1294,6 +1300,9 @@ class InfoExtractor(object):
                          'abr': abr,
                      })
                  f.update(parse_codecs(last_info.get('CODECS')))
+                if last_info.get('AUDIO') in audio_groups:
+                    # TODO: update acodec for audio only formats with the same GROUP-ID
+                    f['acodec'] = 'none'
                  formats.append(f)
                  last_info = {}
                  last_media = {}
@@ -1623,11 +1632,6 @@ class InfoExtractor(object):
                          extract_Initialization(segment_template)
              return ms_info
  
-        def combine_url(base_url, target_url):
-            if re.match(r'^https?://', target_url):
-                return target_url
-            return '%s%s%s' % (base_url, '' if base_url.endswith('/') else '/', target_url)
-
          mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
          formats = []
          for period in mpd_doc.findall(_add_ns('Period')):
@@ -1677,12 +1681,11 @@ class InfoExtractor(object):
                              'tbr': int_or_none(representation_attrib.get('bandwidth'), 1000),
                              'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
                              'fps': int_or_none(representation_attrib.get('frameRate')),
-                            'vcodec': 'none' if content_type == 'audio' else representation_attrib.get('codecs'),
-                            'acodec': 'none' if content_type == 'video' else representation_attrib.get('codecs'),
                              'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
                              'format_note': 'DASH %s' % content_type,
                              'filesize': filesize,
                          }
+                        f.update(parse_codecs(representation_attrib.get('codecs')))
                          representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
                          if 'segment_urls' not in representation_ms_info and 'media_template' in representation_ms_info:
  
@@ -1702,7 +1705,7 @@ class InfoExtractor(object):
                                  representation_ms_info['fragments'] = [{
                                      'url': media_template % {
                                          'Number': segment_number,
-                                        'Bandwidth': representation_attrib.get('bandwidth'),
+                                        'Bandwidth': int_or_none(representation_attrib.get('bandwidth')),
                                      },
                                      'duration': segment_duration,
                                  } for segment_number in range(
@@ -1720,7 +1723,7 @@ class InfoExtractor(object):
                                  def add_segment_url():
                                      segment_url = media_template % {
                                          'Time': segment_time,
-                                        'Bandwidth': representation_attrib.get('bandwidth'),
+                                        'Bandwidth': int_or_none(representation_attrib.get('bandwidth')),
                                          'Number': segment_number,
                                      }
                                      representation_ms_info['fragments'].append({
@@ -1766,7 +1769,7 @@ class InfoExtractor(object):
                                  f['fragments'].append({'url': initialization_url})
                              f['fragments'].extend(representation_ms_info['fragments'])
                              for fragment in f['fragments']:
-                                fragment['url'] = combine_url(base_url, fragment['url'])
+                                fragment['url'] = urljoin(base_url, fragment['url'])
                          try:
                              existing_format = next(
                                  fo for fo in formats
@@ -1798,8 +1801,6 @@ class InfoExtractor(object):
          if ism_doc.get('IsLive') == 'TRUE' or ism_doc.find('Protection') is not None:
              return []
  
-        ism_base_url = base_url(ism_url)
-
          duration = int(ism_doc.attrib['Duration'])
          timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
  
@@ -1839,10 +1840,10 @@ class InfoExtractor(object):
                              next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t'])
                          except IndexError:
                              next_fragment_time = duration
-                        fragment_ctx['duration'] = (next_fragment_time - frgament_time) / fragment_repeat
+                        fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
                      for _ in range(fragment_repeat):
                          fragments.append({
-                            'url': re.sub(r'{start[ _]time}', str(fragment_ctx['time']), track_url_pattern),
+                            'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern),
                              'duration': fragment_ctx['duration'] / stream_timescale,
                          })
                          fragment_ctx['time'] += fragment_ctx['duration']
@@ -1882,7 +1883,7 @@ class InfoExtractor(object):
                  })
          return formats
  
-    def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8'):
+    def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None):
          def absolute_url(video_url):
              return compat_urlparse.urljoin(base_url, video_url)
  
@@ -1899,11 +1900,16 @@ class InfoExtractor(object):
  
          def _media_formats(src, cur_media_type):
              full_url = absolute_url(src)
-            if determine_ext(full_url) == 'm3u8':
+            ext = determine_ext(full_url)
+            if ext == 'm3u8':
                  is_plain_url = False
                  formats = self._extract_m3u8_formats(
                      full_url, video_id, ext='mp4',
                      entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id)
+            elif ext == 'mpd':
+                is_plain_url = False
+                formats = self._extract_mpd_formats(
+                    full_url, video_id, mpd_id=mpd_id)
              else:
                  is_plain_url = True
                  formats = [{