[extractor/common] Improve mpd base URL extraction (closes #10909, closes #11079)

[youtube-dl] / youtube_dl / extractor / common.py
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py

index a9c7a8d164d7e7b126eccc78a23462b44dfb96ac..68b325fca3971790c88d4be0f6b3f46a644f6c89 100644 (file)
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -21,6 +21,7 @@ from ..compat import (
      compat_os_name,
      compat_str,
      compat_urllib_error,
+    compat_urllib_parse_unquote,
      compat_urllib_parse_urlencode,
      compat_urllib_request,
      compat_urlparse,
@@ -87,6 +88,9 @@ class InfoExtractor(object):
  
                      Potential fields:
                      * url        Mandatory. The URL of the video file
+                    * manifest_url
+                                 The URL of the manifest file in case of
+                                 fragmented media (DASH, HLS, HDS)
                      * ext        Will be calculated from URL if missing
                      * format     A human-readable description of the format
                                   ("mp4 container with h264/opus").
@@ -115,6 +119,11 @@ class InfoExtractor(object):
                                   download, lower-case.
                                   "http", "https", "rtsp", "rtmp", "rtmpe",
                                   "m3u8", "m3u8_native" or "http_dash_segments".
+                    * fragments  A list of fragments of the fragmented media,
+                                 with the following entries:
+                                 * "url" (mandatory) - fragment's URL
+                                 * "duration" (optional, int or float)
+                                 * "filesize" (optional, int)
                      * preference Order number of this format. If this field is
                                   present and not None, the formats get sorted
                                   by this field, regardless of all other values.
@@ -226,7 +235,7 @@ class InfoExtractor(object):
      chapter_id:     Id of the chapter the video belongs to, as a unicode string.
  
      The following fields should only be used when the video is an episode of some
-    series or programme:
+    series, programme or podcast:
  
      series:         Title of the series or programme the video episode belongs to.
      season:         Title of the season the video episode belongs to.
@@ -674,33 +683,36 @@ class InfoExtractor(object):
                      username = info[0]
                      password = info[2]
                  else:
-                    raise netrc.NetrcParseError('No authenticators for %s' % netrc_machine)
+                    raise netrc.NetrcParseError(
+                        'No authenticators for %s' % netrc_machine)
              except (IOError, netrc.NetrcParseError) as err:
-                self._downloader.report_warning('parsing .netrc: %s' % error_to_compat_str(err))
+                self._downloader.report_warning(
+                    'parsing .netrc: %s' % error_to_compat_str(err))
  
-        return (username, password)
+        return username, password
  
-    def _get_login_info(self):
+    def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
          """
          Get the login info as (username, password)
-        It will look in the netrc file using the _NETRC_MACHINE value
+        First look for the manually specified credentials using username_option
+        and password_option as keys in params dictionary. If no such credentials
+        are available, look in the netrc file using the netrc_machine or
+        _NETRC_MACHINE value.
          If there's no info available, return (None, None)
          """
          if self._downloader is None:
              return (None, None)
  
-        username = None
-        password = None
          downloader_params = self._downloader.params
  
          # Attempt to use provided username and password or .netrc data
-        if downloader_params.get('username') is not None:
-            username = downloader_params['username']
-            password = downloader_params['password']
+        if downloader_params.get(username_option) is not None:
+            username = downloader_params[username_option]
+            password = downloader_params[password_option]
          else:
-            username, password = self._get_netrc_login_info()
+            username, password = self._get_netrc_login_info(netrc_machine)
  
-        return (username, password)
+        return username, password
  
      def _get_tfa_info(self, note='two-factor verification code'):
          """
@@ -888,16 +900,16 @@ class InfoExtractor(object):
      def _hidden_inputs(html):
          html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
          hidden_inputs = {}
-        for input in re.findall(r'(?i)<input([^>]+)>', html):
-            if not re.search(r'type=(["\'])(?:hidden|submit)\1', input):
-                continue
-            name = re.search(r'(?:name|id)=(["\'])(?P<value>.+?)\1', input)
-            if not name:
+        for input in re.findall(r'(?i)(<input[^>]+>)', html):
+            attrs = extract_attributes(input)
+            if not input:
                  continue
-            value = re.search(r'value=(["\'])(?P<value>.*?)\1', input)
-            if not value:
+            if attrs.get('type') not in ('hidden', 'submit'):
                  continue
-            hidden_inputs[name.group('value')] = value.group('value')
+            name = attrs.get('name') or attrs.get('id')
+            value = attrs.get('value')
+            if name and value is not None:
+                hidden_inputs[name] = value
          return hidden_inputs
  
      def _form_hidden_inputs(self, form_id, html):
@@ -1088,6 +1100,13 @@ class InfoExtractor(object):
              manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
              'bootstrap info', default=None)
  
+        vcodec = None
+        mime_type = xpath_text(
+            manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
+            'base URL', default=None)
+        if mime_type and mime_type.startswith('audio/'):
+            vcodec = 'none'
+
          for i, media_el in enumerate(media_nodes):
              tbr = int_or_none(media_el.attrib.get('bitrate'))
              width = int_or_none(media_el.attrib.get('width'))
@@ -1128,6 +1147,7 @@ class InfoExtractor(object):
                              'width': f.get('width') or width,
                              'height': f.get('height') or height,
                              'format_id': f.get('format_id') if not tbr else format_id,
+                            'vcodec': vcodec,
                          })
                      formats.extend(f4m_formats)
                      continue
@@ -1139,10 +1159,12 @@ class InfoExtractor(object):
              formats.append({
                  'format_id': format_id,
                  'url': manifest_url,
+                'manifest_url': manifest_url,
                  'ext': 'flv' if bootstrap_info is not None else None,
                  'tbr': tbr,
                  'width': width,
                  'height': height,
+                'vcodec': vcodec,
                  'preference': preference,
              })
          return formats
@@ -1163,13 +1185,6 @@ class InfoExtractor(object):
                                m3u8_id=None, note=None, errnote=None,
                                fatal=True, live=False):
  
-        formats = [self._m3u8_meta_format(m3u8_url, ext, preference, m3u8_id)]
-
-        format_url = lambda u: (
-            u
-            if re.match(r'^https?://', u)
-            else compat_urlparse.urljoin(m3u8_url, u))
-
          res = self._download_webpage_handle(
              m3u8_url, video_id,
              note=note or 'Downloading m3u8 information',
@@ -1180,6 +1195,13 @@ class InfoExtractor(object):
          m3u8_doc, urlh = res
          m3u8_url = urlh.geturl()
  
+        formats = [self._m3u8_meta_format(m3u8_url, ext, preference, m3u8_id)]
+
+        format_url = lambda u: (
+            u
+            if re.match(r'^https?://', u)
+            else compat_urlparse.urljoin(m3u8_url, u))
+
          # We should try extracting formats only from master playlists [1], i.e.
          # playlists that describe available qualities. On the other hand media
          # playlists [2] should be returned as is since they contain just the media
@@ -1244,9 +1266,11 @@ class InfoExtractor(object):
                  # format_id intact.
                  if not live:
                      format_id.append(stream_name if stream_name else '%d' % (tbr if tbr else len(formats)))
+                manifest_url = format_url(line.strip())
                  f = {
                      'format_id': '-'.join(format_id),
-                    'url': format_url(line.strip()),
+                    'url': manifest_url,
+                    'manifest_url': manifest_url,
                      'tbr': tbr,
                      'ext': ext,
                      'fps': float_or_none(last_info.get('FRAME-RATE')),
@@ -1515,12 +1539,13 @@ class InfoExtractor(object):
          if res is False:
              return []
          mpd, urlh = res
-        mpd_base_url = re.match(r'https?://.+/', urlh.geturl()).group()
+        mpd_base_url = re.match(r'https?://[^?#&]+/', urlh.geturl()).group()
  
          return self._parse_mpd_formats(
-            compat_etree_fromstring(mpd.encode('utf-8')), mpd_id, mpd_base_url, formats_dict=formats_dict)
+            compat_etree_fromstring(mpd.encode('utf-8')), mpd_id, mpd_base_url,
+            formats_dict=formats_dict, mpd_url=mpd_url)
  
-    def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}):
+    def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}, mpd_url=None):
          """
          Parse formats from MPD manifest.
          References:
@@ -1541,42 +1566,52 @@ class InfoExtractor(object):
  
          def extract_multisegment_info(element, ms_parent_info):
              ms_info = ms_parent_info.copy()
+
+            # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
+            # common attributes and elements.  We will only extract the ones
+            # relevant for us.
+            def extract_common(source):
+                segment_timeline = source.find(_add_ns('SegmentTimeline'))
+                if segment_timeline is not None:
+                    s_e = segment_timeline.findall(_add_ns('S'))
+                    if s_e:
+                        ms_info['total_number'] = 0
+                        ms_info['s'] = []
+                        for s in s_e:
+                            r = int(s.get('r', 0))
+                            ms_info['total_number'] += 1 + r
+                            ms_info['s'].append({
+                                't': int(s.get('t', 0)),
+                                # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
+                                'd': int(s.attrib['d']),
+                                'r': r,
+                            })
+                start_number = source.get('startNumber')
+                if start_number:
+                    ms_info['start_number'] = int(start_number)
+                timescale = source.get('timescale')
+                if timescale:
+                    ms_info['timescale'] = int(timescale)
+                segment_duration = source.get('duration')
+                if segment_duration:
+                    ms_info['segment_duration'] = int(segment_duration)
+
+            def extract_Initialization(source):
+                initialization = source.find(_add_ns('Initialization'))
+                if initialization is not None:
+                    ms_info['initialization_url'] = initialization.attrib['sourceURL']
+
              segment_list = element.find(_add_ns('SegmentList'))
              if segment_list is not None:
+                extract_common(segment_list)
+                extract_Initialization(segment_list)
                  segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
                  if segment_urls_e:
                      ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
-                initialization = segment_list.find(_add_ns('Initialization'))
-                if initialization is not None:
-                    ms_info['initialization_url'] = initialization.attrib['sourceURL']
              else:
                  segment_template = element.find(_add_ns('SegmentTemplate'))
                  if segment_template is not None:
-                    start_number = segment_template.get('startNumber')
-                    if start_number:
-                        ms_info['start_number'] = int(start_number)
-                    segment_timeline = segment_template.find(_add_ns('SegmentTimeline'))
-                    if segment_timeline is not None:
-                        s_e = segment_timeline.findall(_add_ns('S'))
-                        if s_e:
-                            ms_info['total_number'] = 0
-                            ms_info['s'] = []
-                            for s in s_e:
-                                r = int(s.get('r', 0))
-                                ms_info['total_number'] += 1 + r
-                                ms_info['s'].append({
-                                    't': int(s.get('t', 0)),
-                                    # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
-                                    'd': int(s.attrib['d']),
-                                    'r': r,
-                                })
-                    else:
-                        timescale = segment_template.get('timescale')
-                        if timescale:
-                            ms_info['timescale'] = int(timescale)
-                        segment_duration = segment_template.get('duration')
-                        if segment_duration:
-                            ms_info['segment_duration'] = int(segment_duration)
+                    extract_common(segment_template)
                      media_template = segment_template.get('media')
                      if media_template:
                          ms_info['media_template'] = media_template
@@ -1584,11 +1619,14 @@ class InfoExtractor(object):
                      if initialization:
                          ms_info['initialization_url'] = initialization
                      else:
-                        initialization = segment_template.find(_add_ns('Initialization'))
-                        if initialization is not None:
-                            ms_info['initialization_url'] = initialization.attrib['sourceURL']
+                        extract_Initialization(segment_template)
              return ms_info
  
+        def combine_url(base_url, target_url):
+            if re.match(r'^https?://', target_url):
+                return target_url
+            return '%s%s%s' % (base_url, '' if base_url.endswith('/') else '/', target_url)
+
          mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
          formats = []
          for period in mpd_doc.findall(_add_ns('Period')):
@@ -1631,6 +1669,7 @@ class InfoExtractor(object):
                          f = {
                              'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
                              'url': base_url,
+                            'manifest_url': mpd_url,
                              'ext': mimetype2ext(mime_type),
                              'width': int_or_none(representation_attrib.get('width')),
                              'height': int_or_none(representation_attrib.get('height')),
@@ -1645,9 +1684,7 @@ class InfoExtractor(object):
                          }
                          representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
                          if 'segment_urls' not in representation_ms_info and 'media_template' in representation_ms_info:
-                            if 'total_number' not in representation_ms_info and 'segment_duration':
-                                segment_duration = float(representation_ms_info['segment_duration']) / float(representation_ms_info['timescale'])
-                                representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
+
                              media_template = representation_ms_info['media_template']
                              media_template = media_template.replace('$RepresentationID$', representation_id)
                              media_template = re.sub(r'\$(Number|Bandwidth|Time)\$', r'%(\1)d', media_template)
@@ -1656,46 +1693,79 @@ class InfoExtractor(object):
  
                              # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
                              # can't be used at the same time
-                            if '%(Number' in media_template:
-                                representation_ms_info['segment_urls'] = [
-                                    media_template % {
+                            if '%(Number' in media_template and 's' not in representation_ms_info:
+                                segment_duration = None
+                                if 'total_number' not in representation_ms_info and 'segment_duration':
+                                    segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
+                                    representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
+                                representation_ms_info['fragments'] = [{
+                                    'url': media_template % {
                                          'Number': segment_number,
                                          'Bandwidth': representation_attrib.get('bandwidth'),
-                                    }
-                                    for segment_number in range(
-                                        representation_ms_info['start_number'],
-                                        representation_ms_info['total_number'] + representation_ms_info['start_number'])]
+                                    },
+                                    'duration': segment_duration,
+                                } for segment_number in range(
+                                    representation_ms_info['start_number'],
+                                    representation_ms_info['total_number'] + representation_ms_info['start_number'])]
                              else:
-                                representation_ms_info['segment_urls'] = []
+                                # $Number*$ or $Time$ in media template with S list available
+                                # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
+                                # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
+                                representation_ms_info['fragments'] = []
                                  segment_time = 0
+                                segment_d = None
+                                segment_number = representation_ms_info['start_number']
  
                                  def add_segment_url():
-                                    representation_ms_info['segment_urls'].append(
-                                        media_template % {
-                                            'Time': segment_time,
-                                            'Bandwidth': representation_attrib.get('bandwidth'),
-                                        }
-                                    )
+                                    segment_url = media_template % {
+                                        'Time': segment_time,
+                                        'Bandwidth': representation_attrib.get('bandwidth'),
+                                        'Number': segment_number,
+                                    }
+                                    representation_ms_info['fragments'].append({
+                                        'url': segment_url,
+                                        'duration': float_or_none(segment_d, representation_ms_info['timescale']),
+                                    })
  
                                  for num, s in enumerate(representation_ms_info['s']):
                                      segment_time = s.get('t') or segment_time
+                                    segment_d = s['d']
                                      add_segment_url()
+                                    segment_number += 1
                                      for r in range(s.get('r', 0)):
-                                        segment_time += s['d']
+                                        segment_time += segment_d
                                          add_segment_url()
-                                    segment_time += s['d']
-                        if 'segment_urls' in representation_ms_info:
+                                        segment_number += 1
+                                    segment_time += segment_d
+                        elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
+                            # No media template
+                            # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
+                            # or any YouTube dashsegments video
+                            fragments = []
+                            s_num = 0
+                            for segment_url in representation_ms_info['segment_urls']:
+                                s = representation_ms_info['s'][s_num]
+                                for r in range(s.get('r', 0) + 1):
+                                    fragments.append({
+                                        'url': segment_url,
+                                        'duration': float_or_none(s['d'], representation_ms_info['timescale']),
+                                    })
+                            representation_ms_info['fragments'] = fragments
+                        # NB: MPD manifest may contain direct URLs to unfragmented media.
+                        # No fragments key is present in this case.
+                        if 'fragments' in representation_ms_info:
                              f.update({
-                                'segment_urls': representation_ms_info['segment_urls'],
+                                'fragments': [],
                                  'protocol': 'http_dash_segments',
                              })
                              if 'initialization_url' in representation_ms_info:
                                  initialization_url = representation_ms_info['initialization_url'].replace('$RepresentationID$', representation_id)
-                                f.update({
-                                    'initialization_url': initialization_url,
-                                })
                                  if not f.get('url'):
                                      f['url'] = initialization_url
+                                f['fragments'].append({'url': initialization_url})
+                            f['fragments'].extend(representation_ms_info['fragments'])
+                            for fragment in f['fragments']:
+                                fragment['url'] = combine_url(base_url, fragment['url'])
                          try:
                              existing_format = next(
                                  fo for fo in formats
@@ -1741,7 +1811,11 @@ class InfoExtractor(object):
              return is_plain_url, formats
  
          entries = []
-        for media_tag, media_type, media_content in re.findall(r'(?s)(<(?P<tag>video|audio)[^>]*>)(.*?)</(?P=tag)>', webpage):
+        media_tags = [(media_tag, media_type, '')
+                      for media_tag, media_type
+                      in re.findall(r'(?s)(<(video|audio)[^>]*/>)', webpage)]
+        media_tags.extend(re.findall(r'(?s)(<(?P<tag>video|audio)[^>]*>)(.*?)</(?P=tag)>', webpage))
+        for media_tag, media_type, media_content in media_tags:
              media_info = {
                  'formats': [],
                  'subtitles': {},
@@ -1749,7 +1823,7 @@ class InfoExtractor(object):
              media_attributes = extract_attributes(media_tag)
              src = media_attributes.get('src')
              if src:
-                _, formats = _media_formats(src)
+                _, formats = _media_formats(src, media_type)
                  media_info['formats'].extend(formats)
              media_info['thumbnail'] = media_attributes.get('poster')
              if media_content:
@@ -1768,7 +1842,7 @@ class InfoExtractor(object):
                  for track_tag in re.findall(r'<track[^>]+>', media_content):
                      track_attributes = extract_attributes(track_tag)
                      kind = track_attributes.get('kind')
-                    if not kind or kind == 'subtitles':
+                    if not kind or kind in ('subtitles', 'captions'):
                          src = track_attributes.get('src')
                          if not src:
                              continue
@@ -1776,22 +1850,70 @@ class InfoExtractor(object):
                          media_info['subtitles'].setdefault(lang, []).append({
                              'url': absolute_url(src),
                          })
-            if media_info['formats']:
+            if media_info['formats'] or media_info['subtitles']:
                  entries.append(media_info)
          return entries
  
      def _extract_akamai_formats(self, manifest_url, video_id):
          formats = []
+        hdcore_sign = 'hdcore=3.7.0'
          f4m_url = re.sub(r'(https?://.+?)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
-        formats.extend(self._extract_f4m_formats(
-            update_url_query(f4m_url, {'hdcore': '3.7.0'}),
-            video_id, f4m_id='hds', fatal=False))
+        if 'hdcore=' not in f4m_url:
+            f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
+        f4m_formats = self._extract_f4m_formats(
+            f4m_url, video_id, f4m_id='hds', fatal=False)
+        for entry in f4m_formats:
+            entry.update({'extra_param_to_segment_url': hdcore_sign})
+        formats.extend(f4m_formats)
          m3u8_url = re.sub(r'(https?://.+?)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
          formats.extend(self._extract_m3u8_formats(
              m3u8_url, video_id, 'mp4', 'm3u8_native',
              m3u8_id='hls', fatal=False))
          return formats
  
+    def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
+        url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
+        url_base = self._search_regex(r'(?:https?|rtmp|rtsp)(://[^?]+)', url, 'format url')
+        http_base_url = 'http' + url_base
+        formats = []
+        if 'm3u8' not in skip_protocols:
+            formats.extend(self._extract_m3u8_formats(
+                http_base_url + '/playlist.m3u8', video_id, 'mp4',
+                m3u8_entry_protocol, m3u8_id='hls', fatal=False))
+        if 'f4m' not in skip_protocols:
+            formats.extend(self._extract_f4m_formats(
+                http_base_url + '/manifest.f4m',
+                video_id, f4m_id='hds', fatal=False))
+        if 'dash' not in skip_protocols:
+            formats.extend(self._extract_mpd_formats(
+                http_base_url + '/manifest.mpd',
+                video_id, mpd_id='dash', fatal=False))
+        if re.search(r'(?:/smil:|\.smil)', url_base):
+            if 'smil' not in skip_protocols:
+                rtmp_formats = self._extract_smil_formats(
+                    http_base_url + '/jwplayer.smil',
+                    video_id, fatal=False)
+                for rtmp_format in rtmp_formats:
+                    rtsp_format = rtmp_format.copy()
+                    rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
+                    del rtsp_format['play_path']
+                    del rtsp_format['ext']
+                    rtsp_format.update({
+                        'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
+                        'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
+                        'protocol': 'rtsp',
+                    })
+                    formats.extend([rtmp_format, rtsp_format])
+        else:
+            for protocol in ('rtmp', 'rtsp'):
+                if protocol not in skip_protocols:
+                    formats.append({
+                        'url': protocol + url_base,
+                        'format_id': protocol,
+                        'protocol': protocol,
+                    })
+        return formats
+
      def _live_title(self, name):
          """ Generate the title for a live video """
          now = datetime.datetime.now()
@@ -1912,6 +2034,12 @@ class InfoExtractor(object):
              headers['Ytdl-request-proxy'] = geo_verification_proxy
          return headers
  
+    def _generic_id(self, url):
+        return compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
+
+    def _generic_title(self, url):
+        return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0])
+
  
  class SearchInfoExtractor(InfoExtractor):
      """