[youtube] decrypt signature when downloading dash manifest

[youtube-dl] / youtube_dl / extractor / youtube.py
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py

index 54592d174b1b27cbeb55bae783b9549e0ab37dd3..5b0d30ed1f426114bf5047997496ce1c08baa923 100644 (file)
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@ -34,6 +34,7 @@ from ..utils import (
      unified_strdate,
      orderedSet,
      write_json_file,
+    uppercase_escape,
  )
  
  class YoutubeBaseInfoExtractor(InfoExtractor):
@@ -136,14 +137,15 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
                           (?:https?://|//)?                                    # http(s):// or protocol-independent URL (optional)
                           (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
                              (?:www\.)?deturl\.com/www\.youtube\.com/|
-                            (?:www\.)?pwnyoutube\.com|
+                            (?:www\.)?pwnyoutube\.com/|
+                            (?:www\.)?yourepeat\.com/|
                              tube\.majestyc\.net/|
                              youtube\.googleapis\.com/)                        # the various hostnames, with wildcard subdomains
                           (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                           (?:                                                  # the various things that can precede the ID:
                               (?:(?:v|embed|e)/)                               # v/ or embed/ or e/
                               |(?:                                             # or the v= param in all its forms
-                                 (?:(?:watch|movie)(?:_popup)?(?:\.php)?)?    # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
+                                 (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)?  # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                   (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                   (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                   v=
@@ -502,7 +504,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
                  return a % b
  
              m = re.match(
-                r'^(?P<func>[a-zA-Z]+)\((?P<args>[a-z0-9,]+)\)$', expr)
+                r'^(?P<func>[a-zA-Z$]+)\((?P<args>[a-z0-9,]+)\)$', expr)
              if m:
                  fname = m.group('func')
                  if fname not in functions:
@@ -1085,8 +1087,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
              self._downloader.report_warning(err_msg)
              return {}
  
-    def _extract_id(self, url):
-        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
+    @classmethod
+    def extract_id(cls, url):
+        mobj = re.match(cls._VALID_URL, url, re.VERBOSE)
          if mobj is None:
              raise ExtractorError(u'Invalid URL: %s' % url)
          video_id = mobj.group(2)
@@ -1115,7 +1118,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
          mobj = re.search(self._NEXT_URL_RE, url)
          if mobj:
              url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
-        video_id = self._extract_id(url)
+        video_id = self.extract_id(url)
  
          # Get video webpage
          url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
@@ -1363,12 +1366,26 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
              raise ExtractorError(u'no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')
  
          # Look for the DASH manifest
-        dash_manifest_url_lst = video_info.get('dashmpd')
-        if (dash_manifest_url_lst and dash_manifest_url_lst[0] and
-                self._downloader.params.get('youtube_include_dash_manifest', False)):
+        if (self._downloader.params.get('youtube_include_dash_manifest', False)):
              try:
+                # The DASH manifest used needs to be the one from the original video_webpage.
+                # The one found in get_video_info seems to be using different signatures.
+                # However, in the case of an age restriction there won't be any embedded dashmpd in the video_webpage.
+                # Luckily, it seems, this case uses some kind of default signature (len == 86), so the
+                # combination of get_video_info and the _static_decrypt_signature() decryption fallback will work here.
+                if age_gate:
+                    dash_manifest_url = video_info.get('dashmpd')[0];
+                else:
+                    x = re.search(r'ytplayer\.config = ({.*});', video_webpage)
+                    x = json.loads(x.group(1));
+                    dash_manifest_url = x['args']['dashmpd']
+                def decrypt_sig(mobj):
+                    s = mobj.group(1)
+                    dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
+                    return '/signature/%s' % dec_s
+                dash_manifest_url = re.sub(r'/s/([\w\.]+)', decrypt_sig, dash_manifest_url)
                  dash_doc = self._download_xml(
-                    dash_manifest_url_lst[0], video_id,
+                    dash_manifest_url, video_id,
                      note=u'Downloading DASH manifest',
                      errnote=u'Could not download DASH manifest')
                  for r in dash_doc.findall(u'.//{urn:mpeg:DASH:schema:MPD:2011}Representation'):
@@ -1422,7 +1439,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
  
  class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
      IE_DESC = u'YouTube.com playlists'
-    _VALID_URL = r"""(?:
+    _VALID_URL = r"""(?x)(?:
                          (?:https?://)?
                          (?:\w+\.)?
                          youtube\.com/
@@ -1431,7 +1448,11 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
                             \? (?:.*?&)*? (?:p|a|list)=
                          |  p/
                          )
-                        ((?:PL|EC|UU|FL|RD)?[0-9A-Za-z-_]{10,})
+                        (
+                            (?:PL|EC|UU|FL|RD)?[0-9A-Za-z-_]{10,}
+                            # Top tracks, they can also include dots 
+                            |(?:MC)[\w\.]*
+                        )
                          .*
                       |
                          ((?:PL|EC|UU|FL|RD)[0-9A-Za-z-_]{10,})
@@ -1441,11 +1462,6 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
      _VIDEO_RE = r'href="/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)'
      IE_NAME = u'youtube:playlist'
  
-    @classmethod
-    def suitable(cls, url):
-        """Receives a URL and returns True if suitable for this IE."""
-        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
-
      def _real_initialize(self):
          self._login()
  
@@ -1469,7 +1485,7 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
  
      def _real_extract(self, url):
          # Extract playlist id
-        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
+        mobj = re.match(self._VALID_URL, url)
          if mobj is None:
              raise ExtractorError(u'Invalid URL: %s' % url)
          playlist_id = mobj.group(1) or mobj.group(2)
@@ -1590,11 +1606,10 @@ class YoutubeChannelIE(InfoExtractor):
              # Download all channel pages using the json-based channel_ajax query
              for pagenum in itertools.count(1):
                  url = self._MORE_PAGES_URL % (pagenum, channel_id)
-                page = self._download_webpage(url, channel_id,
-                                              u'Downloading page #%s' % pagenum)
-    
-                page = json.loads(page)
-    
+                page = self._download_json(
+                    url, channel_id, note=u'Downloading page #%s' % pagenum,
+                    transform_source=uppercase_escape)
+
                  ids_in_page = self.extract_videos_from_page(page['content_html'])
                  video_ids.extend(ids_in_page)
      
@@ -1694,7 +1709,8 @@ class YoutubeSearchIE(SearchInfoExtractor):
              api_response = data['data']
  
              if 'items' not in api_response:
-                raise ExtractorError(u'[youtube] No video results')
+                raise ExtractorError(
+                    u'[youtube] No video results', expected=True)
  
              new_ids = list(video['id'] for video in api_response['items'])
              video_ids += new_ids
@@ -1814,7 +1830,7 @@ class YoutubeTruncatedURLIE(InfoExtractor):
      IE_NAME = 'youtube:truncated_url'
      IE_DESC = False  # Do not list
      _VALID_URL = r'''(?x)
-        (?:https?://)?[^/]+/watch\?feature=[a-z_]+$|
+        (?:https?://)?[^/]+/watch\?(?:feature=[a-z_]+)?$|
          (?:https?://)?(?:www\.)?youtube\.com/attribution_link\?a=[^&]+$
      '''