[youtube] Extract track and artist

[youtube-dl] / youtube_dl / extractor / youtube.py
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py

index 897398d2086aee820966134f29b04c2fcca17596..677907aba39f444444a6b61a411c29f2e29ba9c2 100644 (file)
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@ -37,6 +37,7 @@ from ..utils import (
      orderedSet,
      parse_codecs,
      parse_duration,
+    qualities,
      remove_quotes,
      remove_start,
      smuggle_url,
@@ -84,7 +85,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
  
          If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.
          """
-        (username, password) = self._get_login_info()
+        username, password = self._get_login_info()
          # No authentication to be performed
          if username is None:
              if self._LOGIN_REQUIRED and self._downloader.params.get('cookiefile') is None:
@@ -509,6 +510,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                  'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/IconaPop',
                  'license': 'Standard YouTube License',
                  'creator': 'Icona Pop',
+                'track': 'I Love It (feat. Charli XCX)',
+                'artist': 'Icona Pop',
              }
          },
          {
@@ -527,6 +530,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                  'uploader_url': r're:https?://(?:www\.)?youtube\.com/user/justintimberlakeVEVO',
                  'license': 'Standard YouTube License',
                  'creator': 'Justin Timberlake',
+                'track': 'Tunnel Vision',
+                'artist': 'Justin Timberlake',
                  'age_limit': 18,
              }
          },
@@ -1764,6 +1769,14 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
          else:
              video_alt_title = video_creator = None
  
+        def extract_meta(field):
+            # Capture the first <li> of the <ul> that follows the <h4 class="title"> heading named `field`
+            pattern = r'<h4[^>]+class="title"[^>]*>\s*%s\s*</h4>\s*<ul[^>]*>\s*<li>(.+?)</li>\s*' % field
+            return self._html_search_regex(pattern, video_webpage, field, default=None)
+
+        track = extract_meta('Song')
+        artist = extract_meta('Artist')
+
          m_episode = re.search(
              r'<div[^>]+id="watch7-headline"[^>]*>\s*<span[^>]*>.*?>(?P<series>[^<]+)</a></b>\s*S(?P<season>\d+)\s*•\s*E(?P<episode>\d+)</span>',
              video_webpage)
@@ -1815,6 +1828,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
  
          chapters = self._extract_chapters(description_original, video_duration)
  
+        def _extract_filesize(media_url):  # filesize = digits after `clen=` or `clen/` in the URL
+            clen = self._search_regex(r'\bclen[=/](\d+)', media_url, 'filesize', default=None)
+            return int_or_none(clen)
+
          if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
              self.report_rtmp_download()
              formats = [{
@@ -1840,6 +1857,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                                  'width': int_or_none(width_height[0]),
                                  'height': int_or_none(width_height[1]),
                              }
+            q = qualities(['small', 'medium', 'hd720'])
              formats = []
              for url_data_str in encoded_url_map.split(','):
                  url_data = compat_parse_qs(url_data_str)
@@ -1919,13 +1937,19 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                  mobj = re.search(r'^(?P<width>\d+)[xX](?P<height>\d+)$', url_data.get('size', [''])[0])
                  width, height = (int(mobj.group('width')), int(mobj.group('height'))) if mobj else (None, None)
  
+                filesize = int_or_none(url_data.get(
+                    'clen', [None])[0]) or _extract_filesize(url)
+
+                quality = url_data.get('quality_label', [None])[0] or url_data.get('quality', [None])[0]
+
                  more_fields = {
-                    'filesize': int_or_none(url_data.get('clen', [None])[0]),
+                    'filesize': filesize,
                      'tbr': float_or_none(url_data.get('bitrate', [None])[0], 1000),
                      'width': width,
                      'height': height,
                      'fps': int_or_none(url_data.get('fps', [None])[0]),
-                    'format_note': url_data.get('quality_label', [None])[0] or url_data.get('quality', [None])[0],
+                    'format_note': quality,
+                    'quality': q(quality),
                  }
                  for key, value in more_fields.items():
                      if value:
@@ -1994,6 +2018,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                      for df in self._extract_mpd_formats(
                              mpd_url, video_id, fatal=dash_mpd_fatal,
                              formats_dict=self._formats):
+                        if not df.get('filesize'):
+                            df['filesize'] = _extract_filesize(df['url'])
                          # Do not overwrite DASH format found in some previous DASH manifest
                          if df['format_id'] not in dash_formats:
                              dash_formats[df['format_id']] = df
@@ -2041,9 +2067,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
              'uploader_url': video_uploader_url,
              'upload_date': upload_date,
              'license': video_license,
-            'creator': video_creator,
+            'creator': video_creator or artist,
              'title': video_title,
-            'alt_title': video_alt_title,
+            'alt_title': video_alt_title or track,
              'thumbnail': video_thumbnail,
              'description': video_description,
              'categories': video_categories,
@@ -2066,6 +2092,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
              'series': series,
              'season_number': season_number,
              'episode_number': episode_number,
+            'track': track,
+            'artist': artist,
          }