[extractor/common] Improve HTML5 entries extraction and add some realworld tests

[youtube-dl] / youtube_dl / extractor / common.py
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py

index a17f7cbc4e828dfeeb10586fd7d835629925f35c..0889288f0e47fe785ee27584c2e84023de420083 100644 (file)
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -13,11 +13,11 @@ import socket
  import sys
  import time
  import math
-import xml
  
  from ..compat import (
      compat_cookiejar,
      compat_cookies,
+    compat_etree_Element,
      compat_etree_fromstring,
      compat_getpass,
      compat_integer_types,
@@ -44,6 +44,7 @@ from ..utils import (
      compiled_regex_type,
      determine_ext,
      determine_protocol,
+    dict_get,
      error_to_compat_str,
      ExtractorError,
      extract_attributes,
@@ -56,13 +57,16 @@ from ..utils import (
      JSON_LD_RE,
      mimetype2ext,
      orderedSet,
+    parse_bitrate,
      parse_codecs,
      parse_duration,
      parse_iso8601,
      parse_m3u8_attributes,
+    parse_resolution,
      RegexNotFoundError,
      sanitized_Request,
      sanitize_filename,
+    str_or_none,
      unescapeHTML,
      unified_strdate,
      unified_timestamp,
@@ -108,10 +112,13 @@ class InfoExtractor(object):
                                     for RTMP - RTMP URL,
                                     for HLS - URL of the M3U8 media playlist,
                                     for HDS - URL of the F4M manifest,
-                                   for DASH - URL of the MPD manifest or
-                                              base URL representing the media
-                                              if MPD manifest is parsed from
-                                              a string,
+                                   for DASH
+                                     - HTTP URL to plain file media (in case of
+                                       unfragmented media)
+                                     - URL of the MPD manifest or base URL
+                                       representing the media if MPD manifest
+                                       is parsed from a string (in case of
+                                       fragmented media)
                                     for MSS - URL of the ISM manifest.
                      * manifest_url
                                   The URL of the manifest file in case of
@@ -802,7 +809,7 @@ class InfoExtractor(object):
              fatal=True, encoding=None, data=None, headers={}, query={},
              expected_status=None):
          """
-        Return a tuple (xml as an xml.etree.ElementTree.Element, URL handle).
+        Return a tuple (xml as a compat_etree_Element, URL handle).
  
          See _download_webpage docstring for arguments specification.
          """
@@ -823,7 +830,7 @@ class InfoExtractor(object):
              transform_source=None, fatal=True, encoding=None,
              data=None, headers={}, query={}, expected_status=None):
          """
-        Return the xml as an xml.etree.ElementTree.Element.
+        Return the xml as a compat_etree_Element.
  
          See _download_webpage docstring for arguments specification.
          """
@@ -1454,7 +1461,7 @@ class InfoExtractor(object):
              manifest_url, video_id, 'Downloading f4m manifest',
              'Unable to download f4m manifest',
              # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
-            # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244)
+            # (see https://github.com/ytdl-org/youtube-dl/issues/6215#issuecomment-121704244)
              transform_source=transform_source,
              fatal=fatal)
  
@@ -1468,7 +1475,7 @@ class InfoExtractor(object):
      def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, f4m_id=None,
                             transform_source=lambda s: fix_xml_ampersands(s).strip(),
                             fatal=True, m3u8_id=None):
-        if not isinstance(manifest, xml.etree.ElementTree.Element) and not fatal:
+        if not isinstance(manifest, compat_etree_Element) and not fatal:
              return []
  
          # currently youtube-dl cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
@@ -1485,7 +1492,7 @@ class InfoExtractor(object):
              manifest_version = '2.0'
              media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
          # Remove unsupported DRM protected media from final formats
-        # rendition (see https://github.com/rg3/youtube-dl/issues/8573).
+        # rendition (see https://github.com/ytdl-org/youtube-dl/issues/8573).
          media_nodes = remove_encrypted_media(media_nodes)
          if not media_nodes:
              return formats
@@ -1615,8 +1622,8 @@ class InfoExtractor(object):
  
          # References:
          # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
-        # 2. https://github.com/rg3/youtube-dl/issues/12211
-        # 3. https://github.com/rg3/youtube-dl/issues/18923
+        # 2. https://github.com/ytdl-org/youtube-dl/issues/12211
+        # 3. https://github.com/ytdl-org/youtube-dl/issues/18923
  
          # We should try extracting formats only from master playlists [1, 4.3.4],
          # i.e. playlists that describe available qualities. On the other hand
@@ -2137,8 +2144,6 @@ class InfoExtractor(object):
                          bandwidth = int_or_none(representation_attrib.get('bandwidth'))
                          f = {
                              'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
-                            # NB: mpd_url may be empty when MPD manifest is parsed from a string
-                            'url': mpd_url or base_url,
                              'manifest_url': mpd_url,
                              'ext': mimetype2ext(mime_type),
                              'width': int_or_none(representation_attrib.get('width')),
@@ -2159,7 +2164,7 @@ class InfoExtractor(object):
                              # First of, % characters outside $...$ templates
                              # must be escaped by doubling for proper processing
                              # by % operator string formatting used further (see
-                            # https://github.com/rg3/youtube-dl/issues/16867).
+                            # https://github.com/ytdl-org/youtube-dl/issues/16867).
                              t = ''
                              in_template = False
                              for c in tmpl:
@@ -2178,7 +2183,7 @@ class InfoExtractor(object):
  
                          # @initialization is a regular template like @media one
                          # so it should be handled just the same way (see
-                        # https://github.com/rg3/youtube-dl/issues/11605)
+                        # https://github.com/ytdl-org/youtube-dl/issues/11605)
                          if 'initialization' in representation_ms_info:
                              initialization_template = prepare_template(
                                  'initialization',
@@ -2264,7 +2269,7 @@ class InfoExtractor(object):
                          elif 'segment_urls' in representation_ms_info:
                              # Segment URLs with no SegmentTimeline
                              # Example: https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
-                            # https://github.com/rg3/youtube-dl/pull/14844
+                            # https://github.com/ytdl-org/youtube-dl/pull/14844
                              fragments = []
                              segment_duration = float_or_none(
                                  representation_ms_info['segment_duration'],
@@ -2277,10 +2282,14 @@ class InfoExtractor(object):
                                      fragment['duration'] = segment_duration
                                  fragments.append(fragment)
                              representation_ms_info['fragments'] = fragments
-                        # NB: MPD manifest may contain direct URLs to unfragmented media.
-                        # No fragments key is present in this case.
+                        # If there is a fragments key available then we correctly recognized fragmented media.
+                        # Otherwise we will assume unfragmented media with direct access. Technically, such
+                        # an assumption is not necessarily correct since we may simply have no support for
+                        # some forms of fragmented media renditions yet, but for now we'll use this fallback.
                          if 'fragments' in representation_ms_info:
                              f.update({
+                                # NB: mpd_url may be empty when MPD manifest is parsed from a string
+                                'url': mpd_url or base_url,
                                  'fragment_base_url': base_url,
                                  'fragments': [],
                                  'protocol': 'http_dash_segments',
@@ -2291,11 +2300,15 @@ class InfoExtractor(object):
                                      f['url'] = initialization_url
                                  f['fragments'].append({location_key(initialization_url): initialization_url})
                              f['fragments'].extend(representation_ms_info['fragments'])
+                        else:
+                            # Assuming direct URL to unfragmented media.
+                            f['url'] = base_url
+
                          # According to [1, 5.3.5.2, Table 7, page 35] @id of Representation
                          # is not necessarily unique within a Period thus formats with
                          # the same `format_id` are quite possible. There are numerous examples
-                        # of such manifests (see https://github.com/rg3/youtube-dl/issues/15111,
-                        # https://github.com/rg3/youtube-dl/issues/13919)
+                        # of such manifests (see https://github.com/ytdl-org/youtube-dl/issues/15111,
+                        # https://github.com/ytdl-org/youtube-dl/issues/13919)
                          full_info = formats_dict.get(representation_id, {}).copy()
                          full_info.update(f)
                          formats.append(full_info)
@@ -2456,7 +2469,7 @@ class InfoExtractor(object):
          media_tags.extend(re.findall(
              # We only allow video|audio followed by a whitespace or '>'.
              # Allowing more characters may end up in significant slow down (see
-            # https://github.com/rg3/youtube-dl/issues/11979, example URL:
+            # https://github.com/ytdl-org/youtube-dl/issues/11979, example URL:
              # http://www.porntrex.com/maps/videositemap.xml).
              r'(?s)(<(?P<tag>(?:amp-)?(?:video|audio))(?:\s+[^>]*)?>)(.*?)</(?P=tag)>', webpage))
          for media_tag, media_type, media_content in media_tags:
@@ -2472,18 +2485,43 @@ class InfoExtractor(object):
              media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
              if media_content:
                  for source_tag in re.findall(r'<source[^>]+>', media_content):
-                    source_attributes = extract_attributes(source_tag)
-                    src = source_attributes.get('src')
+                    s_attr = extract_attributes(source_tag)
+                    # data-video-src and data-src are non standard but seen
+                    # several times in the wild
+                    src = dict_get(s_attr, ('src', 'data-video-src', 'data-src'))
                      if not src:
                          continue
-                    f = parse_content_type(source_attributes.get('type'))
+                    f = parse_content_type(s_attr.get('type'))
                      is_plain_url, formats = _media_formats(src, media_type, f)
                      if is_plain_url:
-                        # res attribute is not standard but seen several times
-                        # in the wild
+                        # width, height, res, label and title attributes are
+                        # all not standard but seen several times in the wild
+                        labels = [
+                            s_attr.get(lbl)
+                            for lbl in ('label', 'title')
+                            if str_or_none(s_attr.get(lbl))
+                        ]
+                        width = int_or_none(s_attr.get('width'))
+                        height = (int_or_none(s_attr.get('height')) or
+                                  int_or_none(s_attr.get('res')))
+                        if not width or not height:
+                            for lbl in labels:
+                                resolution = parse_resolution(lbl)
+                                if not resolution:
+                                    continue
+                                width = width or resolution.get('width')
+                                height = height or resolution.get('height')
+                        for lbl in labels:
+                            tbr = parse_bitrate(lbl)
+                            if tbr:
+                                break
+                        else:
+                            tbr = None
                          f.update({
-                            'height': int_or_none(source_attributes.get('res')),
-                            'format_id': source_attributes.get('label'),
+                            'width': width,
+                            'height': height,
+                            'tbr': tbr,
+                            'format_id': s_attr.get('label') or s_attr.get('title'),
                          })
                          f.update(formats[0])
                          media_info['formats'].append(f)