[extractor/common] Improve jwplayer formats extraction (closes #13379)
[youtube-dl] / youtube_dl / extractor / common.py
index 2cb55d6af826db72ff3539aa88c3c2f0768e38e6..941385ae2eb3bd844989d19efb86f233c50d45c8 100644 (file)
@@ -245,6 +245,10 @@ class InfoExtractor(object):
                     specified in the URL.
     end_time:       Time in seconds where the reproduction should end, as
                     specified in the URL.
+    chapters:       A list of dictionaries, with the following entries:
+                        * "start_time" - The start time of the chapter in seconds
+                        * "end_time" - The end time of the chapter in seconds
+                        * "title" (optional, string)
 
     The following fields should only be used when the video belongs to some logical
     chapter or section:
@@ -372,7 +376,7 @@ class InfoExtractor(object):
             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
         m = cls._VALID_URL_RE.match(url)
         assert m
-        return m.group('id')
+        return compat_str(m.group('id'))
 
     @classmethod
     def working(cls):
@@ -990,6 +994,7 @@ class InfoExtractor(object):
                 'tbr': int_or_none(e.get('bitrate')),
                 'width': int_or_none(e.get('width')),
                 'height': int_or_none(e.get('height')),
+                'view_count': int_or_none(e.get('interactionCount')),
             })
 
         for e in json_ld:
@@ -1996,6 +2001,12 @@ class InfoExtractor(object):
             compat_etree_fromstring(ism.encode('utf-8')), urlh.geturl(), ism_id)
 
     def _parse_ism_formats(self, ism_doc, ism_url, ism_id=None):
+        """
+        Parse formats from ISM manifest.
+        References:
+         1. [MS-SSTR]: Smooth Streaming Protocol,
+            https://msdn.microsoft.com/en-us/library/ff469518.aspx
+        """
         if ism_doc.get('IsLive') == 'TRUE' or ism_doc.find('Protection') is not None:
             return []
 
@@ -2017,8 +2028,11 @@ class InfoExtractor(object):
                     self.report_warning('%s is not a supported codec' % fourcc)
                     continue
                 tbr = int(track.attrib['Bitrate']) // 1000
-                width = int_or_none(track.get('MaxWidth'))
-                height = int_or_none(track.get('MaxHeight'))
+                # [1] does not mention Width and Height attributes. However,
+                # they're often present while MaxWidth and MaxHeight are
+                # missing, so they should be used as fallbacks
+                width = int_or_none(track.get('MaxWidth') or track.get('Width'))
+                height = int_or_none(track.get('MaxHeight') or track.get('Height'))
                 sampling_rate = int_or_none(track.get('SamplingRate'))
 
                 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
@@ -2169,7 +2183,7 @@ class InfoExtractor(object):
     def _extract_akamai_formats(self, manifest_url, video_id, hosts={}):
         formats = []
         hdcore_sign = 'hdcore=3.7.0'
-        f4m_url = re.sub(r'(https?://[^/+])/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
+        f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
         hds_host = hosts.get('hds')
         if hds_host:
             f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
@@ -2191,8 +2205,9 @@ class InfoExtractor(object):
 
     def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
         url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
-        url_base = self._search_regex(r'(?:https?|rtmp|rtsp)(://[^?]+)', url, 'format url')
-        http_base_url = 'http' + url_base
+        url_base = self._search_regex(
+            r'(?:(?:https?|rtmp|rtsp):)?(//[^?]+)', url, 'format url')
+        http_base_url = '%s:%s' % ('http', url_base)
         formats = []
         if 'm3u8' not in skip_protocols:
             formats.extend(self._extract_m3u8_formats(
@@ -2226,7 +2241,7 @@ class InfoExtractor(object):
             for protocol in ('rtmp', 'rtsp'):
                 if protocol not in skip_protocols:
                     formats.append({
-                        'url': protocol + url_base,
+                        'url': '%s:%s' % (protocol, url_base),
                         'format_id': protocol,
                         'protocol': protocol,
                     })
@@ -2313,6 +2328,8 @@ class InfoExtractor(object):
         urls = []
         formats = []
         for source in jwplayer_sources_data:
+            if not isinstance(source, dict):
+                continue
             source_url = self._proto_relative_url(source.get('file'))
             if not source_url:
                 continue