[common] add helper method for Wowza Streaming Engine format extraction

[youtube-dl] / youtube_dl / extractor / common.py
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py

index 8ed16deee86de2d628695efd468347658fce246c..c0002345870a4490cd3d5904d148b3d272813f28 100644 (file)
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -674,33 +674,36 @@ class InfoExtractor(object):
                      username = info[0]
                      password = info[2]
                  else:
-                    raise netrc.NetrcParseError('No authenticators for %s' % netrc_machine)
+                    raise netrc.NetrcParseError(
+                        'No authenticators for %s' % netrc_machine)
              except (IOError, netrc.NetrcParseError) as err:
-                self._downloader.report_warning('parsing .netrc: %s' % error_to_compat_str(err))
+                self._downloader.report_warning(
+                    'parsing .netrc: %s' % error_to_compat_str(err))
  
-        return (username, password)
+        return username, password
  
-    def _get_login_info(self):
+    def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
          """
          Get the login info as (username, password)
-        It will look in the netrc file using the _NETRC_MACHINE value
+        First look for the manually specified credentials using username_option
+        and password_option as keys in params dictionary. If no such credentials
+        available look in the netrc file using the netrc_machine or _NETRC_MACHINE
+        value.
          If there's no info available, return (None, None)
          """
          if self._downloader is None:
              return (None, None)
  
-        username = None
-        password = None
          downloader_params = self._downloader.params
  
          # Attempt to use provided username and password or .netrc data
-        if downloader_params.get('username') is not None:
-            username = downloader_params['username']
-            password = downloader_params['password']
+        if downloader_params.get(username_option) is not None:
+            username = downloader_params[username_option]
+            password = downloader_params[password_option]
          else:
-            username, password = self._get_netrc_login_info()
+            username, password = self._get_netrc_login_info(netrc_machine)
  
-        return (username, password)
+        return username, password
  
      def _get_tfa_info(self, note='two-factor verification code'):
          """
@@ -888,16 +891,16 @@ class InfoExtractor(object):
      def _hidden_inputs(html):
          html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
          hidden_inputs = {}
-        for input in re.findall(r'(?i)<input([^>]+)>', html):
-            if not re.search(r'type=(["\'])(?:hidden|submit)\1', input):
+        for input in re.findall(r'(?i)(<input[^>]+>)', html):
+            attrs = extract_attributes(input)
+            if not input:
                  continue
-            name = re.search(r'(?:name|id)=(["\'])(?P<value>.+?)\1', input)
-            if not name:
+            if attrs.get('type') not in ('hidden', 'submit'):
                  continue
-            value = re.search(r'value=(["\'])(?P<value>.*?)\1', input)
-            if not value:
-                continue
-            hidden_inputs[name.group('value')] = value.group('value')
+            name = attrs.get('name') or attrs.get('id')
+            value = attrs.get('value')
+            if name and value is not None:
+                hidden_inputs[name] = value
          return hidden_inputs
  
      def _form_hidden_inputs(self, form_id, html):
@@ -1163,13 +1166,6 @@ class InfoExtractor(object):
                                m3u8_id=None, note=None, errnote=None,
                                fatal=True, live=False):
  
-        formats = [self._m3u8_meta_format(m3u8_url, ext, preference, m3u8_id)]
-
-        format_url = lambda u: (
-            u
-            if re.match(r'^https?://', u)
-            else compat_urlparse.urljoin(m3u8_url, u))
-
          res = self._download_webpage_handle(
              m3u8_url, video_id,
              note=note or 'Downloading m3u8 information',
@@ -1180,6 +1176,13 @@ class InfoExtractor(object):
          m3u8_doc, urlh = res
          m3u8_url = urlh.geturl()
  
+        formats = [self._m3u8_meta_format(m3u8_url, ext, preference, m3u8_id)]
+
+        format_url = lambda u: (
+            u
+            if re.match(r'^https?://', u)
+            else compat_urlparse.urljoin(m3u8_url, u))
+
          # We should try extracting formats only from master playlists [1], i.e.
          # playlists that describe available qualities. On the other hand media
          # playlists [2] should be returned as is since they contain just the media
@@ -1201,27 +1204,44 @@ class InfoExtractor(object):
                  'protocol': entry_protocol,
                  'preference': preference,
              }]
-        last_info = None
-        last_media = None
+        last_info = {}
+        last_media = {}
          for line in m3u8_doc.splitlines():
              if line.startswith('#EXT-X-STREAM-INF:'):
                  last_info = parse_m3u8_attributes(line)
              elif line.startswith('#EXT-X-MEDIA:'):
-                last_media = parse_m3u8_attributes(line)
+                media = parse_m3u8_attributes(line)
+                media_type = media.get('TYPE')
+                if media_type in ('VIDEO', 'AUDIO'):
+                    media_url = media.get('URI')
+                    if media_url:
+                        format_id = []
+                        for v in (media.get('GROUP-ID'), media.get('NAME')):
+                            if v:
+                                format_id.append(v)
+                        formats.append({
+                            'format_id': '-'.join(format_id),
+                            'url': format_url(media_url),
+                            'language': media.get('LANGUAGE'),
+                            'vcodec': 'none' if media_type == 'AUDIO' else None,
+                            'ext': ext,
+                            'protocol': entry_protocol,
+                            'preference': preference,
+                        })
+                    else:
+                        # When there is no URI in EXT-X-MEDIA let this tag's
+                        # data be used by regular URI lines below
+                        last_media = media
              elif line.startswith('#') or not line.strip():
                  continue
              else:
-                if last_info is None:
-                    formats.append({'url': format_url(line)})
-                    continue
-                tbr = int_or_none(last_info.get('BANDWIDTH'), scale=1000)
+                tbr = int_or_none(last_info.get('AVERAGE-BANDWIDTH') or last_info.get('BANDWIDTH'), scale=1000)
                  format_id = []
                  if m3u8_id:
                      format_id.append(m3u8_id)
-                last_media_name = last_media.get('NAME') if last_media and last_media.get('TYPE') not in ('SUBTITLES', 'CLOSED-CAPTIONS') else None
                  # Despite specification does not mention NAME attribute for
                  # EXT-X-STREAM-INF it still sometimes may be present
-                stream_name = last_info.get('NAME') or last_media_name
+                stream_name = last_info.get('NAME') or last_media.get('NAME')
                  # Bandwidth of live streams may differ over time thus making
                  # format_id unpredictable. So it's better to keep provided
                  # format_id intact.
@@ -1252,11 +1272,9 @@ class InfoExtractor(object):
                          'abr': abr,
                      })
                  f.update(parse_codecs(last_info.get('CODECS')))
-                if last_media is not None:
-                    f['m3u8_media'] = last_media
-                    last_media = None
                  formats.append(f)
                  last_info = {}
+                last_media = {}
          return formats
  
      @staticmethod
@@ -1734,7 +1752,7 @@ class InfoExtractor(object):
              media_attributes = extract_attributes(media_tag)
              src = media_attributes.get('src')
              if src:
-                _, formats = _media_formats(src)
+                _, formats = _media_formats(src, media_type)
                  media_info['formats'].extend(formats)
              media_info['thumbnail'] = media_attributes.get('poster')
              if media_content:
@@ -1777,6 +1795,49 @@ class InfoExtractor(object):
              m3u8_id='hls', fatal=False))
          return formats
  
+    def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=()):
+        """
+        Build a list of format dicts for a Wowza Streaming Engine URL.
+
+        A trailing manifest/playlist/jwplayer file name is stripped from url
+        and the scheme is dropped; the remaining base is then combined with
+        plain 'http' to request playlist.m3u8 and manifest.f4m and, for SMIL
+        based URLs, manifest.mpd and jwplayer.smil (all with fatal=False).
+        For non SMIL URLs bare rtmp and rtsp formats pointing at the base
+        URL are emitted instead.
+
+        skip_protocols is a collection of protocol names that must not be
+        attempted: 'm3u8' and 'f4m' are always honoured, 'dash' and 'smil'
+        only for SMIL based URLs, 'rtmp' and 'rtsp' only for the others.
+        The default is an immutable empty tuple so that no mutable object is
+        shared between calls.
+        """
+        url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
+        url_base = self._search_regex(r'(?:https?|rtmp|rtsp)(://[^?]+)', url, 'format url')
+        http_base_url = 'http' + url_base
+        formats = []
+        if 'm3u8' not in skip_protocols:
+            formats.extend(self._extract_m3u8_formats(
+                http_base_url + '/playlist.m3u8', video_id, 'mp4',
+                m3u8_entry_protocol, m3u8_id='hls', fatal=False))
+        if 'f4m' not in skip_protocols:
+            formats.extend(self._extract_f4m_formats(
+                http_base_url + '/manifest.f4m',
+                video_id, f4m_id='hds', fatal=False))
+        if re.search(r'(?:/smil:|\.smil)', url_base):
+            if 'dash' not in skip_protocols:
+                formats.extend(self._extract_mpd_formats(
+                    http_base_url + '/manifest.mpd',
+                    video_id, mpd_id='dash', fatal=False))
+            if 'smil' not in skip_protocols:
+                rtmp_formats = self._extract_smil_formats(
+                    http_base_url + '/jwplayer.smil',
+                    video_id, fatal=False)
+                for rtmp_format in rtmp_formats:
+                    play_path = rtmp_format.get('play_path')
+                    if play_path is None:
+                        # NOTE(review): presumably only RTMP entries carry
+                        # play_path; an entry without it cannot be turned
+                        # into an RTSP URL, so keep it untouched instead of
+                        # failing with KeyError - confirm against
+                        # _extract_smil_formats
+                        formats.append(rtmp_format)
+                        continue
+                    # Derive the RTSP twin from a copy so that the RTMP
+                    # format itself stays unmodified
+                    rtsp_format = rtmp_format.copy()
+                    del rtsp_format['play_path']
+                    rtsp_format.pop('ext', None)
+                    rtsp_url = '%s/%s' % (rtmp_format['url'], play_path)
+                    rtsp_format.update({
+                        'url': rtsp_url.replace('rtmp://', 'rtsp://'),
+                        'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
+                        'protocol': 'rtsp',
+                    })
+                    formats.extend([rtmp_format, rtsp_format])
+        else:
+            for protocol in ('rtmp', 'rtsp'):
+                if protocol not in skip_protocols:
+                    formats.append({
+                        'url': protocol + url_base,
+                        'format_id': protocol,
+                        'protocol': protocol,
+                    })
+        return formats
+
      def _live_title(self, name):
          """ Generate the title for a live video """
          now = datetime.datetime.now()