[ign] improve extraction and extract uploader_id

[youtube-dl] / youtube_dl / extractor / youtube.py
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py

index 3a2c7c562452e81b7a7872889d5ccde8ece55d25..b252e36e1162406dedfcc531d7d038e6bd357348 100644 (file)
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@ -26,6 +26,7 @@ from ..compat import (
  )
  from ..utils import (
      clean_html,
+    encode_dict,
      ExtractorError,
      float_or_none,
      get_element_by_attribute,
@@ -111,10 +112,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
              'hl': 'en_US',
          }
  
-        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
-        # chokes on unicode
-        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in login_form_strs.items())
-        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
+        login_data = compat_urllib_parse.urlencode(encode_dict(login_form_strs)).encode('ascii')
  
          req = compat_urllib_request.Request(self._LOGIN_URL, login_data)
          login_results = self._download_webpage(
@@ -147,8 +145,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
                  'TrustDevice': 'on',
              })
  
-            tfa_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in tfa_form_strs.items())
-            tfa_data = compat_urllib_parse.urlencode(tfa_form).encode('ascii')
+            tfa_data = compat_urllib_parse.urlencode(encode_dict(tfa_form_strs)).encode('ascii')
  
              tfa_req = compat_urllib_request.Request(self._TWOFACTOR_URL, tfa_data)
              tfa_results = self._download_webpage(
@@ -1304,32 +1301,49 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                  if 'ratebypass' not in url:
                      url += '&ratebypass=yes'
  
-                width = None
-                height = None
-                size_str = url_data.get('size', [''])[0]
-                if size_str.count('x') == 1:
-                    width, height = [int_or_none(x) for x in size_str.split('x')]
-
-                format_url = {
+                # Some itags are not included in the DASH manifest, so the corresponding formats
+                # would lack metadata (see https://github.com/rg3/youtube-dl/pull/5993).
+                # Try to extract that metadata from the url_encoded_fmt_stream_map entry instead.
+                mobj = re.search(r'^(?P<width>\d+)[xX](?P<height>\d+)$', url_data.get('size', [''])[0])
+                width, height = (int(mobj.group('width')), int(mobj.group('height'))) if mobj else (None, None)
+                dct = {
                      'format_id': format_id,
                      'url': url,
                      'player_url': player_url,
-                    # As of this writing these are only defined for DASH formats:
                      'filesize': int_or_none(url_data.get('clen', [None])[0]),
-                    'tbr': float_or_none(url_data.get('bitrate', [None])[0], scale=1024),
+                    'tbr': float_or_none(url_data.get('bitrate', [None])[0], 1000),
                      'width': width,
                      'height': height,
                      'fps': int_or_none(url_data.get('fps', [None])[0]),
+                    'format_note': url_data.get('quality_label', [None])[0] or url_data.get('quality', [None])[0],
                  }
-
-                # drop Nones so they do not overwrite the defaults from self._formats
-                format_url = dict((k, v) for k, v in format_url.items() if v is not None)
-
-                format_full = self._formats.get(format_id, {}).copy()
-                format_full.update(format_url)
-
-                formats.append(format_full)
-
+                type_ = url_data.get('type', [None])[0]
+                if type_:
+                    type_split = type_.split(';')
+                    kind_ext = type_split[0].split('/')
+                    if len(kind_ext) == 2:
+                        kind, ext = kind_ext
+                        dct['ext'] = ext
+                        if kind in ('audio', 'video'):
+                            codecs = None
+                            for mobj in re.finditer(
+                                    r'(?P<key>[a-zA-Z_-]+)=(?P<quote>["\']?)(?P<val>.+?)(?P=quote)(?:;|$)', type_):
+                                if mobj.group('key') == 'codecs':
+                                    codecs = mobj.group('val')
+                                    break
+                            if codecs:
+                                codecs = codecs.split(',')
+                                if len(codecs) == 2:
+                                    acodec, vcodec = codecs[0], codecs[1]
+                                else:
+                                    acodec, vcodec = (codecs[0], 'none') if kind == 'audio' else ('none', codecs[0])
+                                dct.update({
+                                    'acodec': acodec,
+                                    'vcodec': vcodec,
+                                })
+                if format_id in self._formats:
+                    dct.update(self._formats[format_id])
+                formats.append(dct)
          elif video_info.get('hlsvp'):
              manifest_url = video_info['hlsvp'][0]
              url_map = self._extract_from_m3u8(manifest_url, video_id)
@@ -1640,12 +1654,15 @@ class YoutubeChannelIE(InfoExtractor):
          channel_page = self._download_webpage(
              url + '?view=57', channel_id,
              'Downloading channel page', fatal=False)
-        channel_playlist_id = self._html_search_meta(
-            'channelId', channel_page, 'channel id', default=None)
-        if not channel_playlist_id:
-            channel_playlist_id = self._search_regex(
-                r'data-channel-external-id="([^"]+)"',
-                channel_page, 'channel id', default=None)
+        if channel_page is False:
+            channel_playlist_id = False
+        else:
+            channel_playlist_id = self._html_search_meta(
+                'channelId', channel_page, 'channel id', default=None)
+            if not channel_playlist_id:
+                channel_playlist_id = self._search_regex(
+                    r'data-channel-external-id="([^"]+)"',
+                    channel_page, 'channel id', default=None)
          if channel_playlist_id and channel_playlist_id.startswith('UC'):
              playlist_id = 'UU' + channel_playlist_id[2:]
              return self.url_result(
@@ -1821,8 +1838,8 @@ class YoutubeShowIE(InfoExtractor):
      _VALID_URL = r'https?://www\.youtube\.com/show/(?P<id>[^?#]*)'
      IE_NAME = 'youtube:show'
      _TESTS = [{
-        'url': 'http://www.youtube.com/show/airdisasters',
-        'playlist_mincount': 3,
+        'url': 'https://www.youtube.com/show/airdisasters',
+        'playlist_mincount': 5,
          'info_dict': {
              'id': 'airdisasters',
              'title': 'Air Disasters',
@@ -1833,7 +1850,7 @@ class YoutubeShowIE(InfoExtractor):
          mobj = re.match(self._VALID_URL, url)
          playlist_id = mobj.group('id')
          webpage = self._download_webpage(
-            url, playlist_id, 'Downloading show webpage')
+            'https://www.youtube.com/show/%s/playlists' % playlist_id, playlist_id, 'Downloading show webpage')
          # There's one playlist for each season of the show
          m_seasons = list(re.finditer(r'href="(/playlist\?list=.*?)"', webpage))
          self.to_screen('%s: Found %s seasons' % (playlist_id, len(m_seasons)))
@@ -1956,6 +1973,7 @@ class YoutubeTruncatedURLIE(InfoExtractor):
              annotation_id=annotation_[^&]+|
              x-yt-cl=[0-9]+|
              hl=[^&]*|
+            t=[0-9]+
          )?
          |
              attribution_link\?a=[^&]+
@@ -1978,6 +1996,9 @@ class YoutubeTruncatedURLIE(InfoExtractor):
      }, {
          'url': 'https://www.youtube.com/watch?hl=en-GB',
          'only_matching': True,
+    }, {
+        'url': 'https://www.youtube.com/watch?t=2372',
+        'only_matching': True,
      }]
  
      def _real_extract(self, url):