[youtube] Extract additional meta data from video description on youtube music videos
[youtube-dl] / youtube_dl / extractor / youtube.py
index 1bc2c27adacb40b2cd5e8ff2fb2e56b92eece38b..438eb5aa7d371f0d0fd9c70b9c8b7d15a8d44737 100644 (file)
@@ -1086,7 +1086,95 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                 'skip_download': True,
                 'youtube_include_dash_manifest': False,
             },
-        }
+        },
+        {
+            # artist and track fields should return non-null, per issue #20599
+            'url': 'https://music.youtube.com/watch?v=MgNrAu2pzNs',
+            'info_dict': {
+                'id': 'MgNrAu2pzNs',
+                'ext': 'mp4',
+                'title': 'Voyeur Girl',
+                'description': 'md5:7ae382a65843d6df2685993e90a8628f',
+                'upload_date': '20190312',
+                'uploader': 'Various Artists - Topic',
+                'uploader_id': 'UCVWKBi1ELZn0QX2CBLSkiyw',
+                'artist': 'Stephen',
+                'track': 'Voyeur Girl',
+                'album': 'it\'s too much love to know my dear',
+                'release_date': '20190313',
+                'release_year': 2019,
+            },
+            'params': {
+                'skip_download': True,
+            },
+        },
+        {
+            # Retrieve the 'artist' field from the 'Artist:' line of the video
+            # description when it is present on a YouTube Music video.
+            # Some videos have a release_date but no release_year; in that
+            # case release_year should be extracted from release_date.
+            # https://github.com/ytdl-org/youtube-dl/pull/20742#issuecomment-485740932
+            'url': 'https://www.youtube.com/watch?v=k0jLE7tTwjY',
+            'info_dict': {
+                'id': 'k0jLE7tTwjY',
+                'ext': 'mp4',
+                'title': 'Latch Feat. Sam Smith',
+                'description': 'md5:3cb1e8101a7c85fcba9b4fb41b951335',
+                'upload_date': '20150110',
+                'uploader': 'Various Artists - Topic',
+                'uploader_id': 'UCNkEcmYdjrH4RqtNgh7BZ9w',
+                'artist': 'Disclosure',
+                'track': 'Latch Feat. Sam Smith',
+                'album': 'Latch Featuring Sam Smith',
+                'release_date': '20121008',
+                'release_year': 2012,
+            },
+            'params': {
+                'skip_download': True,
+            },
+        },
+        {
+            # handle multiple artists on youtube music video
+            'url': 'https://www.youtube.com/watch?v=74qn0eJSjpA',
+            'info_dict': {
+                'id': '74qn0eJSjpA',
+                'ext': 'mp4',
+                'title': 'Eastside',
+                'description': 'md5:290516bb73dcbfab0dcc4efe6c3de5f2',
+                'upload_date': '20180710',
+                'uploader': 'Benny Blanco - Topic',
+                'uploader_id': 'UCzqz_ksRu_WkIzmivMdIS7A',
+                'artist': 'benny blanco, Halsey, Khalid',
+                'track': 'Eastside',
+                'album': 'Eastside',
+                'release_date': '20180713',
+                'release_year': 2018,
+            },
+            'params': {
+                'skip_download': True,
+            },
+        },
+        {
+            # handle youtube music video with release_year and no release_date
+            'url': 'https://www.youtube.com/watch?v=-hcAI0g-f5M',
+            'info_dict': {
+                'id': '-hcAI0g-f5M',
+                'ext': 'mp4',
+                'title': 'Put It On Me',
+                'description': 'md5:93c55acc682ae7b0c668f2e34e1c069e',
+                'upload_date': '20180426',
+                'uploader': 'Matt Maeson - Topic',
+                'uploader_id': 'UCnEkIGqtGcQMLk73Kp-Q5LQ',
+                'artist': 'Matt Maeson',
+                'track': 'Put It On Me',
+                'album': 'The Hearse',
+                'release_date': None,
+                'release_year': 2018,
+            },
+            'params': {
+                'skip_download': True,
+            },
+        },
     ]
 
     def __init__(self, *args, **kwargs):
@@ -2073,6 +2161,36 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
 
         track = extract_meta('Song')
         artist = extract_meta('Artist')
+        album = None
+        release_date = None
+        release_year = None
+
+        description_info = video_description.split('\n\n')
+        # If the description of the video has the youtube music auto-generated format, extract additional info
+        if len(description_info) >= 5 and description_info[-1] == 'Auto-generated by YouTube.':
+            track_artist = description_info[1].split(' · ')
+            if len(track_artist) >= 2:
+                if track is None:
+                    track = track_artist[0]
+                if artist is None:
+                    artist = re.search(r'Artist: ([^\n]+)', description_info[-2])
+                    if artist:
+                        artist = artist.group(1)
+                    if artist is None:
+                        artist = track_artist[1]
+                        # handle multiple artists
+                        if len(track_artist) > 2:
+                            for i in range(2, len(track_artist)):
+                                artist += ', %s' % track_artist[i]
+            release_year = re.search(r'℗ ([0-9]+)', video_description)
+            if release_year:
+                release_year = int_or_none(release_year.group(1))
+            album = description_info[2]
+            if description_info[4].startswith('Released on: '):
+                release_date = description_info[4].split(': ')[1].replace('-', '')
+                # extract release_year from release_date if necessary
+                if release_year is None:
+                    release_year = int_or_none(release_date[0:4])
 
         m_episode = re.search(
             r'<div[^>]+id="watch7-headline"[^>]*>\s*<span[^>]*>.*?>(?P<series>[^<]+)</a></b>\s*S(?P<season>\d+)\s*•\s*E(?P<episode>\d+)</span>',
@@ -2226,6 +2344,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
             'episode_number': episode_number,
             'track': track,
             'artist': artist,
+            'album': album,
+            'release_date': release_date,
+            'release_year': release_year,
         }