[extractor/common] Support multiple properties in _og_search_property

[youtube-dl] / youtube_dl / extractor / common.py
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py

index b5fce5de21972f75253c0ae4c3352e4f0775da1f..3b6a5491dbd1f17ef0d056b385955354c82df156 100644 (file)
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -44,6 +44,7 @@ from ..utils import (
      sanitized_Request,
      unescapeHTML,
      unified_strdate,
+    unified_timestamp,
      url_basename,
      xpath_element,
      xpath_text,
@@ -54,6 +55,8 @@ from ..utils import (
      update_Request,
      update_url_query,
      parse_m3u8_attributes,
+    extract_attributes,
+    parse_codecs,
  )
  
  
@@ -161,6 +164,7 @@ class InfoExtractor(object):
                          * "height" (optional, int)
                          * "resolution" (optional, string "{width}x{height"},
                                          deprecated)
+                        * "filesize" (optional, int)
      thumbnail:      Full URL to a video thumbnail image.
      description:    Full video description.
      uploader:       Full name of the video uploader.
@@ -723,9 +727,14 @@ class InfoExtractor(object):
                      [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
  
      def _og_search_property(self, prop, html, name=None, **kargs):
+        if not isinstance(prop, (list, tuple)):
+            prop = [prop]
          if name is None:
-            name = 'OpenGraph %s' % prop
-        escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
+            name = 'OpenGraph %s' % prop[0]
+        og_regexes = []
+        for p in prop:
+            og_regexes.extend(self._og_regexes(p))
+        escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
          if escaped is None:
              return None
          return unescapeHTML(escaped)
@@ -803,15 +812,17 @@ class InfoExtractor(object):
          return self._html_search_meta('twitter:player', html,
                                        'twitter card player')
  
-    def _search_json_ld(self, html, video_id, **kwargs):
+    def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
          json_ld = self._search_regex(
              r'(?s)<script[^>]+type=(["\'])application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>',
              html, 'JSON-LD', group='json_ld', **kwargs)
          if not json_ld:
              return {}
-        return self._json_ld(json_ld, video_id, fatal=kwargs.get('fatal', True))
+        return self._json_ld(
+            json_ld, video_id, fatal=kwargs.get('fatal', True),
+            expected_type=expected_type)
  
-    def _json_ld(self, json_ld, video_id, fatal=True):
+    def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
          if isinstance(json_ld, compat_str):
              json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
          if not json_ld:
@@ -819,6 +830,8 @@ class InfoExtractor(object):
          info = {}
          if json_ld.get('@context') == 'http://schema.org':
              item_type = json_ld.get('@type')
+            if expected_type is not None and expected_type != item_type:
+                return info
              if item_type == 'TVEpisode':
                  info.update({
                      'episode': unescapeHTML(json_ld.get('name')),
@@ -839,10 +852,16 @@ class InfoExtractor(object):
                  })
              elif item_type == 'VideoObject':
                  info.update({
+                    'url': json_ld.get('contentUrl'),
                      'title': unescapeHTML(json_ld.get('name')),
                      'description': unescapeHTML(json_ld.get('description')),
-                    'upload_date': unified_strdate(json_ld.get('upload_date')),
-                    'url': unescapeHTML(json_ld.get('contentUrl')),
+                    'thumbnail': json_ld.get('thumbnailUrl'),
+                    'duration': parse_duration(json_ld.get('duration')),
+                    'timestamp': unified_timestamp(json_ld.get('uploadDate')),
+                    'filesize': float_or_none(json_ld.get('contentSize')),
+                    'tbr': int_or_none(json_ld.get('bitrate')),
+                    'width': int_or_none(json_ld.get('width')),
+                    'height': int_or_none(json_ld.get('height')),
                  })
          return dict((k, v) for k, v in info.items() if v is not None)
  
@@ -1193,6 +1212,7 @@ class InfoExtractor(object):
                      'url': format_url(line.strip()),
                      'tbr': tbr,
                      'ext': ext,
+                    'fps': float_or_none(last_info.get('FRAME-RATE')),
                      'protocol': entry_protocol,
                      'preference': preference,
                  }
@@ -1201,24 +1221,17 @@ class InfoExtractor(object):
                      width_str, height_str = resolution.split('x')
                      f['width'] = int(width_str)
                      f['height'] = int(height_str)
-                codecs = last_info.get('CODECS')
-                if codecs:
-                    vcodec, acodec = [None] * 2
-                    va_codecs = codecs.split(',')
-                    if len(va_codecs) == 1:
-                        # Audio only entries usually come with single codec and
-                        # no resolution. For more robustness we also check it to
-                        # be mp4 audio.
-                        if not resolution and va_codecs[0].startswith('mp4a'):
-                            vcodec, acodec = 'none', va_codecs[0]
-                        else:
-                            vcodec = va_codecs[0]
-                    else:
-                        vcodec, acodec = va_codecs[:2]
+                # Unified Streaming Platform
+                mobj = re.search(
+                    r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
+                if mobj:
+                    abr, vbr = mobj.groups()
+                    abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
                      f.update({
-                        'acodec': acodec,
-                        'vcodec': vcodec,
+                        'vbr': vbr,
+                        'abr': abr,
                      })
+                f.update(parse_codecs(last_info.get('CODECS')))
                  if last_media is not None:
                      f['m3u8_media'] = last_media
                      last_media = None
@@ -1473,6 +1486,13 @@ class InfoExtractor(object):
              compat_etree_fromstring(mpd.encode('utf-8')), mpd_id, mpd_base_url, formats_dict=formats_dict)
  
      def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}):
+        """
+        Parse formats from MPD manifest.
+        References:
+         1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
+            http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
+         2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
+        """
          if mpd_doc.get('type') == 'dynamic':
              return []
  
@@ -1505,8 +1525,16 @@ class InfoExtractor(object):
                          s_e = segment_timeline.findall(_add_ns('S'))
                          if s_e:
                              ms_info['total_number'] = 0
+                            ms_info['s'] = []
                              for s in s_e:
-                                ms_info['total_number'] += 1 + int(s.get('r', '0'))
+                                r = int(s.get('r', 0))
+                                ms_info['total_number'] += 1 + r
+                                ms_info['s'].append({
+                                    't': int(s.get('t', 0)),
+                                    # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
+                                    'd': int(s.attrib['d']),
+                                    'r': r,
+                                })
                      else:
                          timescale = segment_template.get('timescale')
                          if timescale:
@@ -1543,7 +1571,7 @@ class InfoExtractor(object):
                          continue
                      representation_attrib = adaptation_set.attrib.copy()
                      representation_attrib.update(representation.attrib)
-                    # According to page 41 of ISO/IEC 29001-1:2014, @mimeType is mandatory
+                    # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
                      mime_type = representation_attrib['mimeType']
                      content_type = mime_type.split('/')[0]
                      if content_type == 'text':
@@ -1587,16 +1615,40 @@ class InfoExtractor(object):
                                  representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
                              media_template = representation_ms_info['media_template']
                              media_template = media_template.replace('$RepresentationID$', representation_id)
-                            media_template = re.sub(r'\$(Number|Bandwidth)\$', r'%(\1)d', media_template)
-                            media_template = re.sub(r'\$(Number|Bandwidth)%([^$]+)\$', r'%(\1)\2', media_template)
+                            media_template = re.sub(r'\$(Number|Bandwidth|Time)\$', r'%(\1)d', media_template)
+                            media_template = re.sub(r'\$(Number|Bandwidth|Time)%([^$]+)\$', r'%(\1)\2', media_template)
                              media_template.replace('$$', '$')
-                            representation_ms_info['segment_urls'] = [
-                                media_template % {
-                                    'Number': segment_number,
-                                    'Bandwidth': representation_attrib.get('bandwidth')}
-                                for segment_number in range(
-                                    representation_ms_info['start_number'],
-                                    representation_ms_info['total_number'] + representation_ms_info['start_number'])]
+
+                            # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
+                            # can't be used at the same time
+                            if '%(Number' in media_template:
+                                representation_ms_info['segment_urls'] = [
+                                    media_template % {
+                                        'Number': segment_number,
+                                        'Bandwidth': representation_attrib.get('bandwidth'),
+                                    }
+                                    for segment_number in range(
+                                        representation_ms_info['start_number'],
+                                        representation_ms_info['total_number'] + representation_ms_info['start_number'])]
+                            else:
+                                representation_ms_info['segment_urls'] = []
+                                segment_time = 0
+
+                                def add_segment_url():
+                                    representation_ms_info['segment_urls'].append(
+                                        media_template % {
+                                            'Time': segment_time,
+                                            'Bandwidth': representation_attrib.get('bandwidth'),
+                                        }
+                                    )
+
+                                for num, s in enumerate(representation_ms_info['s']):
+                                    segment_time = s.get('t') or segment_time
+                                    add_segment_url()
+                                    for r in range(s.get('r', 0)):
+                                        segment_time += s['d']
+                                        add_segment_url()
+                                    segment_time += s['d']
                          if 'segment_urls' in representation_ms_info:
                              f.update({
                                  'segment_urls': representation_ms_info['segment_urls'],
@@ -1623,6 +1675,62 @@ class InfoExtractor(object):
                          self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
          return formats
  
+    def _parse_html5_media_entries(self, base_url, webpage):
+        def absolute_url(video_url):
+            return compat_urlparse.urljoin(base_url, video_url)
+
+        def parse_content_type(content_type):
+            if not content_type:
+                return {}
+            ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
+            if ctr:
+                mimetype, codecs = ctr.groups()
+                f = parse_codecs(codecs)
+                f['ext'] = mimetype2ext(mimetype)
+                return f
+            return {}
+
+        entries = []
+        for media_tag, media_type, media_content in re.findall(r'(?s)(<(?P<tag>video|audio)[^>]*>)(.*?)</(?P=tag)>', webpage):
+            media_info = {
+                'formats': [],
+                'subtitles': {},
+            }
+            media_attributes = extract_attributes(media_tag)
+            src = media_attributes.get('src')
+            if src:
+                media_info['formats'].append({
+                    'url': absolute_url(src),
+                    'vcodec': 'none' if media_type == 'audio' else None,
+                })
+            media_info['thumbnail'] = media_attributes.get('poster')
+            if media_content:
+                for source_tag in re.findall(r'<source[^>]+>', media_content):
+                    source_attributes = extract_attributes(source_tag)
+                    src = source_attributes.get('src')
+                    if not src:
+                        continue
+                    f = parse_content_type(source_attributes.get('type'))
+                    f.update({
+                        'url': absolute_url(src),
+                        'vcodec': 'none' if media_type == 'audio' else None,
+                    })
+                    media_info['formats'].append(f)
+                for track_tag in re.findall(r'<track[^>]+>', media_content):
+                    track_attributes = extract_attributes(track_tag)
+                    kind = track_attributes.get('kind')
+                    if not kind or kind == 'subtitles':
+                        src = track_attributes.get('src')
+                        if not src:
+                            continue
+                        lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
+                        media_info['subtitles'].setdefault(lang, []).append({
+                            'url': absolute_url(src),
+                        })
+            if media_info['formats']:
+                entries.append(media_info)
+        return entries
+
      def _live_title(self, name):
          """ Generate the title for a live video """
          now = datetime.datetime.now()
@@ -1683,7 +1791,7 @@ class InfoExtractor(object):
  
          any_restricted = False
          for tc in self.get_testcases(include_onlymatching=False):
-            if 'playlist' in tc:
+            if tc.get('playlist', []):
                  tc = tc['playlist'][0]
              is_restricted = age_restricted(
                  tc.get('info_dict', {}).get('age_limit'), age_limit)