[youtube:playlist] Fix nonexistent/private playlist detection and skip private tests

[youtube-dl] / youtube_dl / extractor / common.py
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py

index 1076b46da773b5c90cf0c898202f9a8fc5279dbf..dce8c7d0d5ad389aa84bf84f25731a2e680e91e3 100644 (file)
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -21,6 +21,7 @@ from ..compat import (
      compat_os_name,
      compat_str,
      compat_urllib_error,
+    compat_urllib_parse_unquote,
      compat_urllib_parse_urlencode,
      compat_urllib_request,
      compat_urlparse,
@@ -29,6 +30,7 @@ from ..downloader.f4m import remove_encrypted_media
  from ..utils import (
      NO_DEFAULT,
      age_restricted,
+    base_url,
      bug_reports_message,
      clean_html,
      compiled_regex_type,
@@ -57,6 +59,7 @@ from ..utils import (
      parse_m3u8_attributes,
      extract_attributes,
      parse_codecs,
+    urljoin,
  )
  
  
@@ -186,9 +189,10 @@ class InfoExtractor(object):
      uploader_url:   Full URL to a personal webpage of the video uploader.
      location:       Physical location where the video was filmed.
      subtitles:      The available subtitles as a dictionary in the format
-                    {language: subformats}. "subformats" is a list sorted from
-                    lower to higher preference, each element is a dictionary
-                    with the "ext" entry and one of:
+                    {tag: subformats}. "tag" is usually a language code, and
+                    "subformats" is a list sorted from lower to higher
+                    preference, each element is a dictionary with the "ext"
+                    entry and one of:
                          * "data": The subtitles file contents
                          * "url": A URL pointing to the subtitles file
                      "ext" will be calculated from URL if missing
@@ -234,7 +238,7 @@ class InfoExtractor(object):
      chapter_id:     Id of the chapter the video belongs to, as a unicode string.
  
      The following fields should only be used when the video is an episode of some
-    series or programme:
+    series, programme or podcast:
  
      series:         Title of the series or programme the video episode belongs to.
      season:         Title of the season the video episode belongs to.
@@ -884,7 +888,7 @@ class InfoExtractor(object):
                          'url': e.get('contentUrl'),
                          'title': unescapeHTML(e.get('name')),
                          'description': unescapeHTML(e.get('description')),
-                        'thumbnail': e.get('thumbnailUrl'),
+                        'thumbnail': e.get('thumbnailUrl') or e.get('thumbnailURL'),
                          'duration': parse_duration(e.get('duration')),
                          'timestamp': unified_timestamp(e.get('uploadDate')),
                          'filesize': float_or_none(e.get('contentSize')),
@@ -1099,6 +1103,13 @@ class InfoExtractor(object):
              manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
              'bootstrap info', default=None)
  
+        vcodec = None
+        mime_type = xpath_text(
+            manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
+            'base URL', default=None)
+        if mime_type and mime_type.startswith('audio/'):
+            vcodec = 'none'
+
          for i, media_el in enumerate(media_nodes):
              tbr = int_or_none(media_el.attrib.get('bitrate'))
              width = int_or_none(media_el.attrib.get('width'))
@@ -1139,6 +1150,7 @@ class InfoExtractor(object):
                              'width': f.get('width') or width,
                              'height': f.get('height') or height,
                              'format_id': f.get('format_id') if not tbr else format_id,
+                            'vcodec': vcodec,
                          })
                      formats.extend(f4m_formats)
                      continue
@@ -1155,6 +1167,7 @@ class InfoExtractor(object):
                  'tbr': tbr,
                  'width': width,
                  'height': height,
+                'vcodec': vcodec,
                  'preference': preference,
              })
          return formats
@@ -1213,6 +1226,7 @@ class InfoExtractor(object):
                  'protocol': entry_protocol,
                  'preference': preference,
              }]
+        audio_in_video_stream = {}
          last_info = {}
          last_media = {}
          for line in m3u8_doc.splitlines():
@@ -1222,25 +1236,32 @@ class InfoExtractor(object):
                  media = parse_m3u8_attributes(line)
                  media_type = media.get('TYPE')
                  if media_type in ('VIDEO', 'AUDIO'):
+                    group_id = media.get('GROUP-ID')
                      media_url = media.get('URI')
                      if media_url:
                          format_id = []
-                        for v in (media.get('GROUP-ID'), media.get('NAME')):
+                        for v in (group_id, media.get('NAME')):
                              if v:
                                  format_id.append(v)
-                        formats.append({
+                        f = {
                              'format_id': '-'.join(format_id),
                              'url': format_url(media_url),
                              'language': media.get('LANGUAGE'),
-                            'vcodec': 'none' if media_type == 'AUDIO' else None,
                              'ext': ext,
                              'protocol': entry_protocol,
                              'preference': preference,
-                        })
+                        }
+                        if media_type == 'AUDIO':
+                            f['vcodec'] = 'none'
+                            if group_id and not audio_in_video_stream.get(group_id):
+                                audio_in_video_stream[group_id] = False
+                        formats.append(f)
                      else:
                          # When there is no URI in EXT-X-MEDIA let this tag's
                          # data be used by regular URI lines below
                          last_media = media
+                        if media_type == 'AUDIO' and group_id:
+                            audio_in_video_stream[group_id] = True
              elif line.startswith('#') or not line.strip():
                  continue
              else:
@@ -1269,9 +1290,10 @@ class InfoExtractor(object):
                  }
                  resolution = last_info.get('RESOLUTION')
                  if resolution:
-                    width_str, height_str = resolution.split('x')
-                    f['width'] = int(width_str)
-                    f['height'] = int(height_str)
+                    mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
+                    if mobj:
+                        f['width'] = int(mobj.group('width'))
+                        f['height'] = int(mobj.group('height'))
                  # Unified Streaming Platform
                  mobj = re.search(
                      r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
@@ -1283,6 +1305,9 @@ class InfoExtractor(object):
                          'abr': abr,
                      })
                  f.update(parse_codecs(last_info.get('CODECS')))
+                if audio_in_video_stream.get(last_info.get('AUDIO')) is False:
+                    # TODO: update acodec for audio only formats with the same GROUP-ID
+                    f['acodec'] = 'none'
                  formats.append(f)
                  last_info = {}
                  last_media = {}
@@ -1529,7 +1554,7 @@ class InfoExtractor(object):
          if res is False:
              return []
          mpd, urlh = res
-        mpd_base_url = re.match(r'https?://.+/', urlh.geturl()).group()
+        mpd_base_url = base_url(urlh.geturl())
  
          return self._parse_mpd_formats(
              compat_etree_fromstring(mpd.encode('utf-8')), mpd_id, mpd_base_url,
@@ -1612,11 +1637,6 @@ class InfoExtractor(object):
                          extract_Initialization(segment_template)
              return ms_info
  
-        def combine_url(base_url, target_url):
-            if re.match(r'^https?://', target_url):
-                return target_url
-            return '%s%s%s' % (base_url, '' if base_url.endswith('/') else '/', target_url)
-
          mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
          formats = []
          for period in mpd_doc.findall(_add_ns('Period')):
@@ -1666,12 +1686,11 @@ class InfoExtractor(object):
                              'tbr': int_or_none(representation_attrib.get('bandwidth'), 1000),
                              'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
                              'fps': int_or_none(representation_attrib.get('frameRate')),
-                            'vcodec': 'none' if content_type == 'audio' else representation_attrib.get('codecs'),
-                            'acodec': 'none' if content_type == 'video' else representation_attrib.get('codecs'),
                              'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
                              'format_note': 'DASH %s' % content_type,
                              'filesize': filesize,
                          }
+                        f.update(parse_codecs(representation_attrib.get('codecs')))
                          representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
                          if 'segment_urls' not in representation_ms_info and 'media_template' in representation_ms_info:
  
@@ -1691,7 +1710,7 @@ class InfoExtractor(object):
                                  representation_ms_info['fragments'] = [{
                                      'url': media_template % {
                                          'Number': segment_number,
-                                        'Bandwidth': representation_attrib.get('bandwidth'),
+                                        'Bandwidth': int_or_none(representation_attrib.get('bandwidth')),
                                      },
                                      'duration': segment_duration,
                                  } for segment_number in range(
@@ -1709,7 +1728,7 @@ class InfoExtractor(object):
                                  def add_segment_url():
                                      segment_url = media_template % {
                                          'Time': segment_time,
-                                        'Bandwidth': representation_attrib.get('bandwidth'),
+                                        'Bandwidth': int_or_none(representation_attrib.get('bandwidth')),
                                          'Number': segment_number,
                                      }
                                      representation_ms_info['fragments'].append({
@@ -1755,7 +1774,7 @@ class InfoExtractor(object):
                                  f['fragments'].append({'url': initialization_url})
                              f['fragments'].extend(representation_ms_info['fragments'])
                              for fragment in f['fragments']:
-                                fragment['url'] = combine_url(base_url, fragment['url'])
+                                fragment['url'] = urljoin(base_url, fragment['url'])
                          try:
                              existing_format = next(
                                  fo for fo in formats
@@ -1770,7 +1789,106 @@ class InfoExtractor(object):
                          self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
          return formats
  
-    def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8'):
+    def _extract_ism_formats(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True):
+        """Download the ISM manifest at ism_url and parse it into formats ([] if the download returns False)."""
+        handle = self._download_webpage_handle(
+            ism_url, video_id, fatal=fatal,
+            note=note or 'Downloading ISM manifest',
+            errnote=errnote or 'Failed to download ISM manifest')
+        if handle is False:
+            # The download helper signalled failure without raising: no formats
+            return []
+        manifest_text, url_handle = handle
+        manifest_doc = compat_etree_fromstring(manifest_text.encode('utf-8'))
+        return self._parse_ism_formats(manifest_doc, url_handle.geturl(), ism_id)
+
+    def _parse_ism_formats(self, ism_doc, ism_url, ism_id=None):
+        """Build format dicts from a parsed ISM manifest (ism_doc, located at ism_url).
+
+        Returns [] for live or protected manifests; tracks with an unsupported
+        FourCC are skipped with a warning. ism_id, if set, prefixes each format_id.
+        """
+        if ism_doc.get('IsLive') == 'TRUE' or ism_doc.find('Protection') is not None:
+            return []
+
+        duration = int(ism_doc.attrib['Duration'])
+        timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
+
+        formats = []
+        for stream in ism_doc.findall('StreamIndex'):
+            stream_type = stream.get('Type')
+            if stream_type not in ('video', 'audio'):
+                continue
+            url_pattern = stream.attrib['Url']
+            stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
+            stream_name = stream.get('Name')
+            stream_fragments = stream.findall('c')
+            for track in stream.findall('QualityLevel'):
+                fourcc = track.get('FourCC')
+                # TODO: add support for WVC1 and WMAP
+                if fourcc not in ('H264', 'AVC1', 'AACL'):
+                    self.report_warning('%s is not a supported codec' % fourcc)
+                    continue
+                tbr = int(track.attrib['Bitrate']) // 1000
+                width = int_or_none(track.get('MaxWidth'))
+                height = int_or_none(track.get('MaxHeight'))
+                sampling_rate = int_or_none(track.get('SamplingRate'))
+
+                track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
+                track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern)
+
+                fragments = []
+                fragment_ctx = {'time': 0}
+                for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
+                    fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
+                    fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
+                    fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
+                    if not fragment_ctx['duration']:
+                        # No 'd': derive it from the NEXT <c> element's start time (index the
+                        # list, not the element); fall back to the manifest duration.
+                        try:
+                            next_fragment_time = int(stream_fragments[stream_fragment_index + 1].attrib['t'])
+                        except (IndexError, KeyError):
+                            next_fragment_time = duration
+                        fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
+                    for _ in range(fragment_repeat):
+                        fragments.append({
+                            'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern),
+                            'duration': fragment_ctx['duration'] / stream_timescale,
+                        })
+                        fragment_ctx['time'] += fragment_ctx['duration']
+
+                format_id = [v for v in (ism_id, stream_name) if v]
+                format_id.append(compat_str(tbr))
+                formats.append({
+                    'format_id': '-'.join(format_id),
+                    'url': ism_url,
+                    'manifest_url': ism_url,
+                    'ext': 'ismv' if stream_type == 'video' else 'isma',
+                    'width': width,
+                    'height': height,
+                    'tbr': tbr,
+                    'asr': sampling_rate,
+                    'vcodec': 'none' if stream_type == 'audio' else fourcc,
+                    'acodec': 'none' if stream_type == 'video' else fourcc,
+                    'protocol': 'ism',
+                    'fragments': fragments,
+                    '_download_params': {
+                        'duration': duration,
+                        'timescale': stream_timescale,
+                        'width': width or 0,
+                        'height': height or 0,
+                        'fourcc': fourcc,
+                        'codec_private_data': track.get('CodecPrivateData'),
+                        'sampling_rate': sampling_rate,
+                        'channels': int_or_none(track.get('Channels', 2)),
+                        'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
+                        'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
+                    },
+                })
+        return formats
+
+    def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None):
          def absolute_url(video_url):
              return compat_urlparse.urljoin(base_url, video_url)
  
@@ -1787,11 +1905,16 @@ class InfoExtractor(object):
  
          def _media_formats(src, cur_media_type):
              full_url = absolute_url(src)
-            if determine_ext(full_url) == 'm3u8':
+            ext = determine_ext(full_url)
+            if ext == 'm3u8':
                  is_plain_url = False
                  formats = self._extract_m3u8_formats(
                      full_url, video_id, ext='mp4',
                      entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id)
+            elif ext == 'mpd':
+                is_plain_url = False
+                formats = self._extract_mpd_formats(
+                    full_url, video_id, mpd_id=mpd_id)
              else:
                  is_plain_url = True
                  formats = [{
@@ -1801,7 +1924,11 @@ class InfoExtractor(object):
              return is_plain_url, formats
  
          entries = []
-        for media_tag, media_type, media_content in re.findall(r'(?s)(<(?P<tag>video|audio)[^>]*>)(.*?)</(?P=tag)>', webpage):
+        media_tags = [(media_tag, media_type, '')
+                      for media_tag, media_type
+                      in re.findall(r'(?s)(<(video|audio)[^>]*/>)', webpage)]
+        media_tags.extend(re.findall(r'(?s)(<(?P<tag>video|audio)[^>]*>)(.*?)</(?P=tag)>', webpage))
+        for media_tag, media_type, media_content in media_tags:
              media_info = {
                  'formats': [],
                  'subtitles': {},
@@ -1840,10 +1967,13 @@ class InfoExtractor(object):
                  entries.append(media_info)
          return entries
  
-    def _extract_akamai_formats(self, manifest_url, video_id):
+    def _extract_akamai_formats(self, manifest_url, video_id, hosts={}):
          formats = []
          hdcore_sign = 'hdcore=3.7.0'
-        f4m_url = re.sub(r'(https?://.+?)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
+        f4m_url = re.sub(r'(https?://[^/+])/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
+        hds_host = hosts.get('hds')
+        if hds_host:
+            f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
          if 'hdcore=' not in f4m_url:
              f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
          f4m_formats = self._extract_f4m_formats(
@@ -1851,7 +1981,10 @@ class InfoExtractor(object):
          for entry in f4m_formats:
              entry.update({'extra_param_to_segment_url': hdcore_sign})
          formats.extend(f4m_formats)
-        m3u8_url = re.sub(r'(https?://.+?)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
+        m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
+        hls_host = hosts.get('hls')
+        if hls_host:
+            m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
          formats.extend(self._extract_m3u8_formats(
              m3u8_url, video_id, 'mp4', 'm3u8_native',
              m3u8_id='hls', fatal=False))
@@ -1870,11 +2003,11 @@ class InfoExtractor(object):
              formats.extend(self._extract_f4m_formats(
                  http_base_url + '/manifest.f4m',
                  video_id, f4m_id='hds', fatal=False))
+        if 'dash' not in skip_protocols:
+            formats.extend(self._extract_mpd_formats(
+                http_base_url + '/manifest.mpd',
+                video_id, mpd_id='dash', fatal=False))
          if re.search(r'(?:/smil:|\.smil)', url_base):
-            if 'dash' not in skip_protocols:
-                formats.extend(self._extract_mpd_formats(
-                    http_base_url + '/manifest.mpd',
-                    video_id, mpd_id='dash', fatal=False))
              if 'smil' not in skip_protocols:
                  rtmp_formats = self._extract_smil_formats(
                      http_base_url + '/jwplayer.smil',
@@ -2020,6 +2153,12 @@ class InfoExtractor(object):
              headers['Ytdl-request-proxy'] = geo_verification_proxy
          return headers
  
+    def _generic_id(self, url):
+        """Return the URL-unquoted last '/'-separated component of url (trailing slashes ignored), minus its extension."""
+        last_component = url.rstrip('/').rpartition('/')[2]
+        stem, _ = os.path.splitext(last_component)
+        return compat_urllib_parse_unquote(stem)
+
+    def _generic_title(self, url):
+        """Return the URL-unquoted result of url_basename(url), minus its extension."""
+        stem, _ = os.path.splitext(url_basename(url))
+        return compat_urllib_parse_unquote(stem)
+
  
  class SearchInfoExtractor(InfoExtractor):
      """