Merge branch 'master' of https://github.com/zx8/youtube-dl into zx8-master

[youtube-dl] / youtube_dl / extractor / youtube.py
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py

index 2c02da2b731241dcabd7f5907136ec0f4594f793..eba699c3ace0ebdb379a457e2a6ceaff8f1f2f41 100644 (file)
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@ -11,7 +11,6 @@ import time
  import traceback
  
  from .common import InfoExtractor, SearchInfoExtractor
  import traceback
  
  from .common import InfoExtractor, SearchInfoExtractor
-from .subtitles import SubtitlesInfoExtractor
  from ..jsinterp import JSInterpreter
  from ..swfinterp import SWFInterpreter
  from ..compat import (
  from ..jsinterp import JSInterpreter
  from ..swfinterp import SWFInterpreter
  from ..compat import (
@@ -185,7 +184,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
              return
  
  
              return
  
  
-class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
+class YoutubeIE(YoutubeBaseInfoExtractor):
      IE_DESC = 'YouTube.com'
      _VALID_URL = r"""(?x)^
                       (
      IE_DESC = 'YouTube.com'
      _VALID_URL = r"""(?x)^
                       (
@@ -562,10 +561,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
          else:
              assert False, 'Invalid player type %r' % player_type
  
          else:
              assert False, 'Invalid player type %r' % player_type
  
-        if cache_spec is None:
-            test_string = ''.join(map(compat_chr, range(len(example_sig))))
-            cache_res = res(test_string)
-            cache_spec = [ord(c) for c in cache_res]
+        test_string = ''.join(map(compat_chr, range(len(example_sig))))
+        cache_res = res(test_string)
+        cache_spec = [ord(c) for c in cache_res]
  
          self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec)
          return res
  
          self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec)
          return res
@@ -649,7 +647,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
              raise ExtractorError(
                  'Signature extraction failed: ' + tb, cause=e)
  
              raise ExtractorError(
                  'Signature extraction failed: ' + tb, cause=e)
  
-    def _get_available_subtitles(self, video_id, webpage):
+    def _get_subtitles(self, video_id, webpage):
          try:
              subs_doc = self._download_xml(
                  'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
          try:
              subs_doc = self._download_xml(
                  'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
@@ -663,23 +661,27 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
              lang = track.attrib['lang_code']
              if lang in sub_lang_list:
                  continue
              lang = track.attrib['lang_code']
              if lang in sub_lang_list:
                  continue
-            params = compat_urllib_parse.urlencode({
-                'lang': lang,
-                'v': video_id,
-                'fmt': self._downloader.params.get('subtitlesformat', 'srt'),
-                'name': track.attrib['name'].encode('utf-8'),
-            })
-            url = 'https://www.youtube.com/api/timedtext?' + params
-            sub_lang_list[lang] = url
+            sub_formats = []
+            for ext in ['sbv', 'vtt', 'srt']:
+                params = compat_urllib_parse.urlencode({
+                    'lang': lang,
+                    'v': video_id,
+                    'fmt': ext,
+                    'name': track.attrib['name'].encode('utf-8'),
+                })
+                sub_formats.append({
+                    'url': 'https://www.youtube.com/api/timedtext?' + params,
+                    'ext': ext,
+                })
+            sub_lang_list[lang] = sub_formats
          if not sub_lang_list:
              self._downloader.report_warning('video doesn\'t have subtitles')
              return {}
          return sub_lang_list
  
          if not sub_lang_list:
              self._downloader.report_warning('video doesn\'t have subtitles')
              return {}
          return sub_lang_list
  
-    def _get_available_automatic_caption(self, video_id, webpage):
+    def _get_automatic_captions(self, video_id, webpage):
          """We need the webpage for getting the captions url, pass it as an
             argument to speed up the process."""
          """We need the webpage for getting the captions url, pass it as an
             argument to speed up the process."""
-        sub_format = self._downloader.params.get('subtitlesformat', 'srt')
          self.to_screen('%s: Looking for automatic captions' % video_id)
          mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
          err_msg = 'Couldn\'t find automatic captions for %s' % video_id
          self.to_screen('%s: Looking for automatic captions' % video_id)
          mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
          err_msg = 'Couldn\'t find automatic captions for %s' % video_id
@@ -709,14 +711,20 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
              sub_lang_list = {}
              for lang_node in caption_list.findall('target'):
                  sub_lang = lang_node.attrib['lang_code']
              sub_lang_list = {}
              for lang_node in caption_list.findall('target'):
                  sub_lang = lang_node.attrib['lang_code']
-                params = compat_urllib_parse.urlencode({
-                    'lang': original_lang,
-                    'tlang': sub_lang,
-                    'fmt': sub_format,
-                    'ts': timestamp,
-                    'kind': caption_kind,
-                })
-                sub_lang_list[sub_lang] = caption_url + '&' + params
+                sub_formats = []
+                for ext in ['sbv', 'vtt', 'srt']:
+                    params = compat_urllib_parse.urlencode({
+                        'lang': original_lang,
+                        'tlang': sub_lang,
+                        'fmt': ext,
+                        'ts': timestamp,
+                        'kind': caption_kind,
+                    })
+                    sub_formats.append({
+                        'url': caption_url + '&' + params,
+                        'ext': ext,
+                    })
+                sub_lang_list[sub_lang] = sub_formats
              return sub_lang_list
          # An extractor error can be raised by the download process if there are
          # no automatic captions but there are subtitles
              return sub_lang_list
          # An extractor error can be raised by the download process if there are
          # no automatic captions but there are subtitles
@@ -971,10 +979,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
  
          # subtitles
          video_subtitles = self.extract_subtitles(video_id, video_webpage)
  
          # subtitles
          video_subtitles = self.extract_subtitles(video_id, video_webpage)
-
-        if self._downloader.params.get('listsubtitles', False):
-            self._list_available_subtitles(video_id, video_webpage)
-            return
+        automatic_captions = self.extract_automatic_captions(video_id, video_webpage)
  
          if 'length_seconds' not in video_info:
              self._downloader.report_warning('unable to extract video duration')
  
          if 'length_seconds' not in video_info:
              self._downloader.report_warning('unable to extract video duration')
@@ -1123,6 +1128,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
              'description': video_description,
              'categories': video_categories,
              'subtitles': video_subtitles,
              'description': video_description,
              'categories': video_categories,
              'subtitles': video_subtitles,
+            'automatic_captions': automatic_captions,
              'duration': video_duration,
              'age_limit': 18 if age_gate else 0,
              'annotations': video_annotations,
              'duration': video_duration,
              'age_limit': 18 if age_gate else 0,
              'annotations': video_annotations,
@@ -1147,13 +1153,13 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
                          |  p/
                          )
                          (
                          |  p/
                          )
                          (
-                            (?:PL|LL|EC|UU|FL|RD)?[0-9A-Za-z-_]{10,}
+                            (?:PL|LL|EC|UU|FL|RD|UL)?[0-9A-Za-z-_]{10,}
                              # Top tracks, they can also include dots
                              |(?:MC)[\w\.]*
                          )
                          .*
                       |
                              # Top tracks, they can also include dots
                              |(?:MC)[\w\.]*
                          )
                          .*
                       |
-                        ((?:PL|LL|EC|UU|FL|RD)[0-9A-Za-z-_]{10,})
+                        ((?:PL|LL|EC|UU|FL|RD|UL)[0-9A-Za-z-_]{10,})
                       )"""
      _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
      _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)'
                       )"""
      _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
      _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)'
@@ -1238,7 +1244,7 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
              for vid_id in ids]
  
      def _extract_mix(self, playlist_id):
              for vid_id in ids]
  
      def _extract_mix(self, playlist_id):
-        # The mixes are generated from a a single video
+        # The mixes are generated from a single video
          # the id of the playlist is just 'RD' + video_id
          url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[-11:], playlist_id)
          webpage = self._download_webpage(
          # the id of the playlist is just 'RD' + video_id
          url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[-11:], playlist_id)
          webpage = self._download_webpage(
@@ -1257,27 +1263,7 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
  
          return self.playlist_result(url_results, playlist_id, title)
  
  
          return self.playlist_result(url_results, playlist_id, title)
  
-    def _real_extract(self, url):
-        # Extract playlist id
-        mobj = re.match(self._VALID_URL, url)
-        if mobj is None:
-            raise ExtractorError('Invalid URL: %s' % url)
-        playlist_id = mobj.group(1) or mobj.group(2)
-
-        # Check if it's a video-specific URL
-        query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
-        if 'v' in query_dict:
-            video_id = query_dict['v'][0]
-            if self._downloader.params.get('noplaylist'):
-                self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
-                return self.url_result(video_id, 'Youtube', video_id=video_id)
-            else:
-                self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))
-
-        if playlist_id.startswith('RD'):
-            # Mixes require a custom extraction process
-            return self._extract_mix(playlist_id)
-
+    def _extract_playlist(self, playlist_id):
          url = self._TEMPLATE_URL % playlist_id
          page = self._download_webpage(url, playlist_id)
          more_widget_html = content_html = page
          url = self._TEMPLATE_URL % playlist_id
          page = self._download_webpage(url, playlist_id)
          more_widget_html = content_html = page
@@ -1321,6 +1307,29 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
          url_results = self._ids_to_results(ids)
          return self.playlist_result(url_results, playlist_id, playlist_title)
  
          url_results = self._ids_to_results(ids)
          return self.playlist_result(url_results, playlist_id, playlist_title)
  
+    def _real_extract(self, url):
+        # Extract playlist id
+        mobj = re.match(self._VALID_URL, url)
+        if mobj is None:
+            raise ExtractorError('Invalid URL: %s' % url)
+        playlist_id = mobj.group(1) or mobj.group(2)
+
+        # Check if it's a video-specific URL
+        query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
+        if 'v' in query_dict:
+            video_id = query_dict['v'][0]
+            if self._downloader.params.get('noplaylist'):
+                self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
+                return self.url_result(video_id, 'Youtube', video_id=video_id)
+            else:
+                self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))
+
+        if playlist_id.startswith('RD') or playlist_id.startswith('UL'):
+            # Mixes require a custom extraction process
+            return self._extract_mix(playlist_id)
+
+        return self._extract_playlist(playlist_id)
+
  
  class YoutubeChannelIE(InfoExtractor):
      IE_DESC = 'YouTube.com channels'
  
  class YoutubeChannelIE(InfoExtractor):
      IE_DESC = 'YouTube.com channels'
@@ -1526,7 +1535,7 @@ class YoutubeSearchURLIE(InfoExtractor):
  
          webpage = self._download_webpage(url, query)
          result_code = self._search_regex(
  
          webpage = self._download_webpage(url, query)
          result_code = self._search_regex(
-            r'(?s)<ol class="item-section"(.*?)</ol>', webpage, 'result HTML')
+            r'(?s)<ol[^>]+class="item-section"(.*?)</ol>', webpage, 'result HTML')
  
          part_codes = re.findall(
              r'(?s)<h3 class="yt-lockup-title">(.*?)</h3>', result_code)
  
          part_codes = re.findall(
              r'(?s)<h3 class="yt-lockup-title">(.*?)</h3>', result_code)
@@ -1637,21 +1646,27 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
  
  
  class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
  
  
  class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
+    IE_NAME = 'youtube:recommended'
      IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
      _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
      _FEED_NAME = 'recommended'
      _PLAYLIST_TITLE = 'Youtube Recommended videos'
  
  
      IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
      _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
      _FEED_NAME = 'recommended'
      _PLAYLIST_TITLE = 'Youtube Recommended videos'
  
  
-class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
+class YoutubeWatchLaterIE(YoutubePlaylistIE):
+    IE_NAME = 'youtube:watchlater'
      IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)'
      IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)'
-    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
+    _VALID_URL = r'https?://www\.youtube\.com/(?:feed/watch_later|playlist\?list=WL)|:ytwatchlater'
      _FEED_NAME = 'watch_later'
      _PLAYLIST_TITLE = 'Youtube Watch Later'
      _PERSONAL_FEED = True
  
      _FEED_NAME = 'watch_later'
      _PLAYLIST_TITLE = 'Youtube Watch Later'
      _PERSONAL_FEED = True
  
+    def _real_extract(self, url):
+        return self._extract_playlist('WL')
+
  
  class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
  
  class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
+    IE_NAME = 'youtube:history'
      IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)'
      _VALID_URL = 'https?://www\.youtube\.com/feed/history|:ythistory'
      _FEED_NAME = 'history'
      IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)'
      _VALID_URL = 'https?://www\.youtube\.com/feed/history|:ythistory'
      _FEED_NAME = 'history'