Merge remote-tracking branch 'drags/yt-feed-loadmore'
author     Philipp Hagemeister <phihag@phihag.de>
           Sat, 13 Sep 2014 05:14:19 +0000 (07:14 +0200)
committer  Philipp Hagemeister <phihag@phihag.de>
           Sat, 13 Sep 2014 05:14:19 +0000 (07:14 +0200)
youtube_dl/extractor/youtube.py

index c77f09aace1b3c46a870121041b966baa930e7bb,cd35a16207273c3d6d9ccd81722102f998a0ebc1..e28db2b5a57c7208ac61ab90a026b0d26e050ba5
@@@ -1,5 -1,7 +1,5 @@@
  # coding: utf-8
  
 -import errno
 -import io
  import itertools
  import json
  import os.path
@@@ -19,6 -21,7 +19,6 @@@ from ..utils import 
      compat_str,
  
      clean_html,
 -    get_cachedir,
      get_element_by_id,
      get_element_by_attribute,
      ExtractorError,
      unescapeHTML,
      unified_strdate,
      orderedSet,
 -    write_json_file,
      uppercase_escape,
  )
  
  class YoutubeBaseInfoExtractor(InfoExtractor):
      """Provide base functions for Youtube extractors"""
      _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
 +    _TWOFACTOR_URL = 'https://accounts.google.com/SecondFactor'
      _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
      _AGE_URL = 'https://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
      _NETRC_MACHINE = 'youtube'
              fatal=False))
  
      def _login(self):
 +        """
 +        Attempt to log in to YouTube.
 +        True is returned if successful or skipped.
 +        False is returned if login failed.
 +
 +        If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.
 +        """
          (username, password) = self._get_login_info()
          # No authentication to be performed
          if username is None:
              if self._LOGIN_REQUIRED:
                  raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True)
 -            return False
 +            return True
  
          login_page = self._download_webpage(
              self._LOGIN_URL, None,
@@@ -77,7 -73,6 +77,7 @@@
                  u'Email': username,
                  u'GALX': galx,
                  u'Passwd': password,
 +
                  u'PersistentCookie': u'yes',
                  u'_utf8': u'霱',
                  u'bgresponse': u'js_disabled',
@@@ -93,7 -88,6 +93,7 @@@
                  u'uilel': u'3',
                  u'hl': u'en_US',
          }
 +
          # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
          # chokes on unicode
          login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
              note=u'Logging in', errnote=u'unable to log in', fatal=False)
          if login_results is False:
              return False
 +
 +        if re.search(r'id="errormsg_0_Passwd"', login_results) is not None:
 +            raise ExtractorError(u'Please use your account password and a two-factor code instead of an application-specific password.', expected=True)
 +
 +        # Two-Factor
 +        # TODO add SMS and phone call support - these require making a request and then prompting the user
 +
 +        if re.search(r'(?i)<form[^>]* id="gaia_secondfactorform"', login_results) is not None:
 +            tfa_code = self._get_tfa_info()
 +
 +            if tfa_code is None:
 +                self._downloader.report_warning(u'Two-factor authentication required. Provide it with --twofactor <code>')
 +                self._downloader.report_warning(u'(Note that only TOTP (Google Authenticator App) codes work at this time.)')
 +                return False
 +
 +            # Unlike the first login form, secTok and timeStmp are both required for the TFA form
 +
 +            match = re.search(r'id="secTok"\n\s+value=\'(.+)\'/>', login_results, re.M | re.U)
 +            if match is None:
 +                self._downloader.report_warning(u'Failed to get secTok - did the page structure change?')
 +                return False
 +            secTok = match.group(1)
 +            match = re.search(r'id="timeStmp"\n\s+value=\'(.+)\'/>', login_results, re.M | re.U)
 +            if match is None:
 +                self._downloader.report_warning(u'Failed to get timeStmp - did the page structure change?')
 +                return False
 +            timeStmp = match.group(1)
 +
 +            tfa_form_strs = {
 +                u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
 +                u'smsToken': u'',
 +                u'smsUserPin': tfa_code,
 +                u'smsVerifyPin': u'Verify',
 +
 +                u'PersistentCookie': u'yes',
 +                u'checkConnection': u'',
 +                u'checkedDomains': u'youtube',
 +                u'pstMsg': u'1',
 +                u'secTok': secTok,
 +                u'timeStmp': timeStmp,
 +                u'service': u'youtube',
 +                u'hl': u'en_US',
 +            }
 +            tfa_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in tfa_form_strs.items())
 +            tfa_data = compat_urllib_parse.urlencode(tfa_form).encode('ascii')
 +
 +            tfa_req = compat_urllib_request.Request(self._TWOFACTOR_URL, tfa_data)
 +            tfa_results = self._download_webpage(
 +                tfa_req, None,
 +                note=u'Submitting TFA code', errnote=u'unable to submit tfa', fatal=False)
 +
 +            if tfa_results is False:
 +                return False
 +
 +            if re.search(r'(?i)<form[^>]* id="gaia_secondfactorform"', tfa_results) is not None:
 +                self._downloader.report_warning(u'Two-factor code expired. Please try again, or use a one-use backup code instead.')
 +                return False
 +            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', tfa_results) is not None:
 +                self._downloader.report_warning(u'unable to log in - did the page structure change?')
 +                return False
 +            if re.search(r'smsauth-interstitial-reviewsettings', tfa_results) is not None:
 +                self._downloader.report_warning(u'Your Google account has a security notice. Please log in on your web browser, resolve the notice, and try again.')
 +                return False
 +
          if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
              self._downloader.report_warning(u'unable to log in: bad username or password')
              return False
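
Taken together, the block above lets a TOTP code be passed through to the second-factor form. A minimal usage sketch, assuming the new --twofactor flag maps to a 'twofactor' parameter as the _get_tfa_info() call suggests (all account values below are placeholders):

    import youtube_dl

    ydl = youtube_dl.YoutubeDL({
        'username': 'user@example.com',     # placeholder account
        'password': 'account-password',     # the real password, not an app-specific one
        'twofactor': '123456',              # current TOTP code from an authenticator app
    })
    ydl.download(['https://www.youtube.com/watch?v=BaW_jenozKc'])
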
@@@ -199,7 -131,7 +199,7 @@@ class YoutubeIE(YoutubeBaseInfoExtracto
      IE_DESC = u'YouTube.com'
      _VALID_URL = r"""(?x)^
                       (
 -                         (?:https?://|//)?                                    # http(s):// or protocol-independent URL (optional)
 +                         (?:https?://|//)                                    # http(s):// or protocol-independent URL
                           (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
                              (?:www\.)?deturl\.com/www\.youtube\.com/|
                              (?:www\.)?pwnyoutube\.com/|
                               )
                           ))
                           |youtu\.be/                                          # just youtu.be/xxxx
 -                         |https?://(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
 +                         |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
                           )
                       )?                                                       # all until now is optional -> you can pass the naked ID
                       ([0-9A-Za-z_-]{11})                                      # here is it! the YouTube video ID
          '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
  
          # Dash webm audio
 -        '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 48, 'preference': -50},
 +        '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 128, 'preference': -50},
          '172': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 256, 'preference': -50},
  
          # RTMP (unnamed)
                  u"upload_date": u"20121002",
                  u"description": u"test chars:  \"'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .",
                  u"categories": [u'Science & Technology'],
 +                'like_count': int,
 +                'dislike_count': int,
              }
          },
          {
          """Indicate the download will use the RTMP protocol."""
          self.to_screen(u'RTMP download detected')
  
 -    def _extract_signature_function(self, video_id, player_url, slen):
 +    def _signature_cache_id(self, example_sig):
 +        """ Return a string representation of a signature """
 +        return u'.'.join(compat_str(len(part)) for part in example_sig.split('.'))
 +
 +    def _extract_signature_function(self, video_id, player_url, example_sig):
          id_m = re.match(
              r'.*-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player)?\.(?P<ext>[a-z]+)$',
              player_url)
          player_id = id_m.group('id')
  
          # Read from filesystem cache
 -        func_id = '%s_%s_%d' % (player_type, player_id, slen)
 +        func_id = '%s_%s_%s' % (
 +            player_type, player_id, self._signature_cache_id(example_sig))
          assert os.path.basename(func_id) == func_id
 -        cache_dir = get_cachedir(self._downloader.params)
  
 -        cache_enabled = cache_dir is not None
 -        if cache_enabled:
 -            cache_fn = os.path.join(os.path.expanduser(cache_dir),
 -                                    u'youtube-sigfuncs',
 -                                    func_id + '.json')
 -            try:
 -                with io.open(cache_fn, 'r', encoding='utf-8') as cachef:
 -                    cache_spec = json.load(cachef)
 -                return lambda s: u''.join(s[i] for i in cache_spec)
 -            except IOError:
 -                pass  # No cache available
 +        cache_spec = self._downloader.cache.load(u'youtube-sigfuncs', func_id)
 +        if cache_spec is not None:
 +            return lambda s: u''.join(s[i] for i in cache_spec)
  
          if player_type == 'js':
              code = self._download_webpage(
          else:
              assert False, 'Invalid player type %r' % player_type
  
 -        if cache_enabled:
 -            try:
 -                test_string = u''.join(map(compat_chr, range(slen)))
 -                cache_res = res(test_string)
 -                cache_spec = [ord(c) for c in cache_res]
 -                try:
 -                    os.makedirs(os.path.dirname(cache_fn))
 -                except OSError as ose:
 -                    if ose.errno != errno.EEXIST:
 -                        raise
 -                write_json_file(cache_spec, cache_fn)
 -            except Exception:
 -                tb = traceback.format_exc()
 -                self._downloader.report_warning(
 -                    u'Writing cache to %r failed: %s' % (cache_fn, tb))
 +        if cache_spec is None:
 +            test_string = u''.join(map(compat_chr, range(len(example_sig))))
 +            cache_res = res(test_string)
 +            cache_spec = [ord(c) for c in cache_res]
  
 +        self._downloader.cache.store(u'youtube-sigfuncs', func_id, cache_spec)
          return res
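
The cache entry stored above is just a reordering recipe: because the deobfuscation only picks characters out of the input, running it once over a known test string records, for each output position, the source index it came from. A standalone sketch of that idea (spec values are made up):

    def apply_spec(s, cache_spec):
        # Replay a recorded signature transformation by picking input
        # characters by index, in order.
        return ''.join(s[i] for i in cache_spec)

    # Hypothetical 3-character spec: output is s[2], s[0], s[1].
    assert apply_spec('abc', [2, 0, 1]) == 'cab'
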
  
 -    def _print_sig_code(self, func, slen):
 +    def _print_sig_code(self, func, example_sig):
          def gen_sig_code(idxs):
              def _genslice(start, end, step):
                  starts = u'' if start == 0 else str(start)
              else:
                  yield _genslice(start, i, step)
  
 -        test_string = u''.join(map(compat_chr, range(slen)))
 +        test_string = u''.join(map(compat_chr, range(len(example_sig))))
          cache_res = func(test_string)
          cache_spec = [ord(c) for c in cache_res]
          expr_code = u' + '.join(gen_sig_code(cache_spec))
 -        code = u'if len(s) == %d:\n    return %s\n' % (slen, expr_code)
 +        signature_id_tuple = '(%s)' % (
 +            ', '.join(compat_str(len(p)) for p in example_sig.split('.')))
 +        code = (u'if tuple(len(p) for p in s.split(\'.\')) == %s:\n'
 +                u'    return %s\n') % (signature_id_tuple, expr_code)
          self.to_screen(u'Extracted signature function:\n' + code)
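
With --youtube-print-sig-code, the printed snippet is now keyed on the tuple of dot-separated part lengths rather than the total length, so two signatures of equal length but different shapes no longer collide. A hypothetical output for an 8.40-shaped signature might look like:

    if tuple(len(p) for p in s.split('.')) == (8, 40):
        return s[2:18] + s[0] + s[19:48]
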
  
      def _parse_sig_js(self, jscode):
          if player_url.startswith(u'//'):
              player_url = u'https:' + player_url
          try:
 -            player_id = (player_url, len(s))
 +            player_id = (player_url, self._signature_cache_id(s))
              if player_id not in self._player_cache:
                  func = self._extract_signature_function(
 -                    video_id, player_url, len(s)
 +                    video_id, player_url, s
                  )
                  self._player_cache[player_id] = func
              func = self._player_cache[player_id]
              if self._downloader.params.get('youtube_print_sig_code'):
 -                self._print_sig_code(func, len(s))
 +                self._print_sig_code(func, s)
              return func(s)
          except Exception as e:
              tb = traceback.format_exc()
              raise ExtractorError(
 -                u'Automatic signature extraction failed: ' + tb, cause=e)
 +                u'Signature extraction failed: ' + tb, cause=e)
  
      def _get_available_subtitles(self, video_id, webpage):
          try:
          sub_lang_list = {}
          for l in lang_list:
              lang = l[1]
 +            if lang in sub_lang_list:
 +                continue
              params = compat_urllib_parse.urlencode({
                  'lang': lang,
                  'v': video_id,
              upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
              upload_date = unified_strdate(upload_date)
  
 -        m_cat_container = get_element_by_id("eow-category", video_webpage)
 +        m_cat_container = self._search_regex(
 +            r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>',
 +            video_webpage, 'categories', fatal=False)
          if m_cat_container:
              category = self._html_search_regex(
                  r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
              else:
                  video_description = u''
  
 -        def _extract_count(klass):
 +        def _extract_count(count_name):
              count = self._search_regex(
 -                r'class="%s">([\d,]+)</span>' % re.escape(klass),
 -                video_webpage, klass, default=None)
 +                r'id="watch-%s"[^>]*>.*?([\d,]+)\s*</span>' % re.escape(count_name),
 +                video_webpage, count_name, default=None)
              if count is not None:
                  return int(count.replace(',', ''))
              return None
 -        like_count = _extract_count(u'likes-count')
 -        dislike_count = _extract_count(u'dislikes-count')
 +        like_count = _extract_count(u'like')
 +        dislike_count = _extract_count(u'dislike')
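
The like/dislike counters are now read from id="watch-like" / id="watch-dislike" elements instead of class-based spans. Against a hypothetical fragment of the new markup, the helper resolves like this:

    import re

    # Made-up watch-page fragment shaped like the markup the new regex targets.
    video_webpage = '<span id="watch-like" class="yt-uix-button-content">1,234</span>'
    m = re.search(
        r'id="watch-%s"[^>]*>.*?([\d,]+)\s*</span>' % re.escape('like'),
        video_webpage)
    print(int(m.group(1).replace(',', '')))  # 1234
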
  
          # subtitles
          video_subtitles = self.extract_subtitles(video_id, video_webpage)
              url_map = {}
              for url_data_str in encoded_url_map.split(','):
                  url_data = compat_parse_qs(url_data_str)
 -                if 'itag' in url_data and 'url' in url_data:
 -                    url = url_data['url'][0]
 -                    if 'sig' in url_data:
 -                        url += '&signature=' + url_data['sig'][0]
 -                    elif 's' in url_data:
 -                        encrypted_sig = url_data['s'][0]
 -
 -                        if not age_gate:
 -                            jsplayer_url_json = self._search_regex(
 -                                r'"assets":.+?"js":\s*("[^"]+")',
 -                                video_webpage, u'JS player URL')
 -                            player_url = json.loads(jsplayer_url_json)
 +                if 'itag' not in url_data or 'url' not in url_data:
 +                    continue
 +                format_id = url_data['itag'][0]
 +                url = url_data['url'][0]
 +
 +                if 'sig' in url_data:
 +                    url += '&signature=' + url_data['sig'][0]
 +                elif 's' in url_data:
 +                    encrypted_sig = url_data['s'][0]
 +
 +                    if not age_gate:
 +                        jsplayer_url_json = self._search_regex(
 +                            r'"assets":.+?"js":\s*("[^"]+")',
 +                            video_webpage, u'JS player URL')
 +                        player_url = json.loads(jsplayer_url_json)
 +                    if player_url is None:
 +                        player_url_json = self._search_regex(
 +                            r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',
 +                            video_webpage, u'age gate player URL')
 +                        player_url = json.loads(player_url_json)
 +
 +                    if self._downloader.params.get('verbose'):
                          if player_url is None:
 -                            player_url_json = self._search_regex(
 -                                r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',
 -                                video_webpage, u'age gate player URL')
 -                            player_url = json.loads(player_url_json)
 -
 -                        if self._downloader.params.get('verbose'):
 -                            if player_url is None:
 -                                player_version = 'unknown'
 -                                player_desc = 'unknown'
 +                            player_version = 'unknown'
 +                            player_desc = 'unknown'
 +                        else:
 +                            if player_url.endswith('swf'):
 +                                player_version = self._search_regex(
 +                                    r'-(.+?)(?:/watch_as3)?\.swf$', player_url,
 +                                    u'flash player', fatal=False)
 +                                player_desc = 'flash player %s' % player_version
                              else:
 -                                if player_url.endswith('swf'):
 -                                    player_version = self._search_regex(
 -                                        r'-(.+?)(?:/watch_as3)?\.swf$', player_url,
 -                                        u'flash player', fatal=False)
 -                                    player_desc = 'flash player %s' % player_version
 -                                else:
 -                                    player_version = self._search_regex(
 -                                        r'html5player-([^/]+?)(?:/html5player)?\.js',
 -                                        player_url,
 -                                        'html5 player', fatal=False)
 -                                    player_desc = u'html5 player %s' % player_version
 -
 -                            parts_sizes = u'.'.join(compat_str(len(part)) for part in encrypted_sig.split('.'))
 -                            self.to_screen(u'encrypted signature length %d (%s), itag %s, %s' %
 -                                (len(encrypted_sig), parts_sizes, url_data['itag'][0], player_desc))
 -
 -                        signature = self._decrypt_signature(
 -                            encrypted_sig, video_id, player_url, age_gate)
 -                        url += '&signature=' + signature
 -                    if 'ratebypass' not in url:
 -                        url += '&ratebypass=yes'
 -                    url_map[url_data['itag'][0]] = url
 +                                player_version = self._search_regex(
 +                                    r'html5player-([^/]+?)(?:/html5player)?\.js',
 +                                    player_url,
 +                                    'html5 player', fatal=False)
 +                                player_desc = u'html5 player %s' % player_version
 +
 +                        parts_sizes = self._signature_cache_id(encrypted_sig)
 +                        self.to_screen(u'{%s} signature length %s, %s' %
 +                            (format_id, parts_sizes, player_desc))
 +
 +                    signature = self._decrypt_signature(
 +                        encrypted_sig, video_id, player_url, age_gate)
 +                    url += '&signature=' + signature
 +                if 'ratebypass' not in url:
 +                    url += '&ratebypass=yes'
 +                url_map[format_id] = url
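
For context on the restructured loop above: every comma-separated chunk of the stream map is itself a query string, so compat_parse_qs yields a list per key. A tiny sketch with made-up values:

    try:
        from urllib.parse import parse_qs   # Python 3
    except ImportError:
        from urlparse import parse_qs       # Python 2; youtube-dl wraps this as compat_parse_qs

    # Made-up chunk; real ones carry many more fields.
    chunk = 'itag=22&url=http%3A%2F%2Fexample.com%2Fvideoplayback&s=ABCDEFGH.0123456789'
    url_data = parse_qs(chunk)
    print(url_data['itag'][0])  # '22'
    print(url_data['s'][0])     # 'ABCDEFGH.0123456789' -> goes through _decrypt_signature()
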
              formats = _map_to_format_list(url_map)
          elif video_info.get('hlsvp'):
              manifest_url = video_info['hlsvp'][0]
@@@ -1025,26 -959,21 +1025,26 @@@ class YoutubePlaylistIE(YoutubeBaseInfo
          self._login()
  
      def _ids_to_results(self, ids):
 -        return [self.url_result(vid_id, 'Youtube', video_id=vid_id)
 -                       for vid_id in ids]
 +        return [
 +            self.url_result(vid_id, 'Youtube', video_id=vid_id)
 +            for vid_id in ids]
  
      def _extract_mix(self, playlist_id):
          # The mixes are generated from a single video
          # the id of the playlist is just 'RD' + video_id
          url = 'https://youtube.com/watch?v=%s&list=%s' % (playlist_id[-11:], playlist_id)
 -        webpage = self._download_webpage(url, playlist_id, u'Downloading Youtube mix')
 +        webpage = self._download_webpage(
 +            url, playlist_id, u'Downloading Youtube mix')
          search_title = lambda class_name: get_element_by_attribute('class', class_name, webpage)
 -        title_span = (search_title('playlist-title') or
 -            search_title('title long-title') or search_title('title'))
 +        title_span = (
 +            search_title('playlist-title') or
 +            search_title('title long-title') or
 +            search_title('title'))
          title = clean_html(title_span)
 -        video_re = r'''(?x)data-video-username=".*?".*?
 -                       href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s''' % re.escape(playlist_id)
 -        ids = orderedSet(re.findall(video_re, webpage, flags=re.DOTALL))
 +        ids = orderedSet(re.findall(
 +            r'''(?xs)data-video-username=".*?".*?
 +                       href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s''' % re.escape(playlist_id),
 +            webpage))
          url_results = self._ids_to_results(ids)
  
          return self.playlist_result(url_results, playlist_id, title)
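
As the comment above notes, a mix id embeds its seed video, which is why the watch URL can be rebuilt from the playlist id alone (the id below is the youtube-dl test video, used purely as an illustration):

    playlist_id = 'RDBaW_jenozKc'     # 'RD' + 11-character seed video id
    video_id = playlist_id[-11:]      # 'BaW_jenozKc'
    url = 'https://youtube.com/watch?v=%s&list=%s' % (video_id, playlist_id)
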
@@@ -1137,7 -1066,6 +1137,7 @@@ class YoutubeTopListIE(YoutubePlaylistI
              msg = u'Downloading Youtube mix'
              if i > 0:
                  msg += ', retry #%d' % i
 +
              webpage = self._download_webpage(url, title, msg)
              ids = orderedSet(re.findall(video_re, webpage))
              if ids:
@@@ -1397,6 -1325,7 +1397,7 @@@ class YoutubeFeedsInfoExtractor(Youtube
                                            u'%s feed' % self._FEED_NAME,
                                            u'Downloading page %s' % i)
              feed_html = info.get('feed_html') or info.get('content_html')
+             load_more_widget_html = info.get('load_more_widget_html') or feed_html
              m_ids = re.finditer(r'"/watch\?v=(.*?)["&]', feed_html)
              ids = orderedSet(m.group(1) for m in m_ids)
              feed_entries.extend(
                  for video_id in ids)
              mobj = re.search(
                  r'data-uix-load-more-href="/?[^"]+paging=(?P<paging>\d+)',
-                 feed_html)
+                 load_more_widget_html)
              if mobj is None:
                  break
              paging = mobj.group('paging')
          return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)
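
This is the change the 'yt-feed-loadmore' branch contributes: the paging token is looked up in load_more_widget_html first, falling back to the feed/content HTML, instead of only in feed_html. Condensed into a standalone helper, the merged logic is roughly:

    import re

    def _next_paging(info):
        # Prefer the load-more widget markup, fall back to the feed HTML,
        # then pull the paging token used to request the next page.
        feed_html = info.get('feed_html') or info.get('content_html')
        widget = info.get('load_more_widget_html') or feed_html
        mobj = re.search(
            r'data-uix-load-more-href="/?[^"]+paging=(?P<paging>\d+)', widget)
        return mobj.group('paging') if mobj else None

    # Hypothetical feed response fragment:
    print(_next_paging({
        'feed_html': '<div>...</div>',
        'load_more_widget_html': '<button data-uix-load-more-href="/feed_ajax?paging=2">',
    }))  # '2'
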
  
 -class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
 -    IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
 -    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
 -    _FEED_NAME = 'subscriptions'
 -    _PLAYLIST_TITLE = u'Youtube Subscriptions'
 -
  class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
      IE_DESC = u'YouTube.com recommended videos, "ytrec" keyword (requires authentication)'
      _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
@@@ -1442,43 -1377,6 +1443,43 @@@ class YoutubeFavouritesIE(YoutubeBaseIn
          return self.url_result(playlist_id, 'YoutubePlaylist')
  
  
 +class YoutubeSubscriptionsIE(YoutubePlaylistIE):
 +    IE_NAME = u'youtube:subscriptions'
 +    IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
 +    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
 +
 +    def _real_extract(self, url):
 +        title = u'Youtube Subscriptions'
 +        page = self._download_webpage('https://www.youtube.com/feed/subscriptions', title)
 +
 +        # The extraction process is the same as for playlists, but the regex
 +        # for the video ids doesn't contain an index
 +        ids = []
 +        more_widget_html = content_html = page
 +
 +        for page_num in itertools.count(1):
 +            matches = re.findall(r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html)
 +            new_ids = orderedSet(matches)
 +            ids.extend(new_ids)
 +
 +            mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
 +            if not mobj:
 +                break
 +
 +            more = self._download_json(
 +                'https://youtube.com/%s' % mobj.group('more'), title,
 +                'Downloading page #%s' % page_num,
 +                transform_source=uppercase_escape)
 +            content_html = more['content_html']
 +            more_widget_html = more['load_more_widget_html']
 +
 +        return {
 +            '_type': 'playlist',
 +            'title': title,
 +            'entries': self._ids_to_results(ids),
 +        }
 +
 +
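
For quick reference, the _VALID_URL above accepts both the subscriptions feed URL and the ':ytsubs' keyword (with its optional long spelling); this just restates the pattern:

    import re

    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    for candidate in ('https://www.youtube.com/feed/subscriptions',
                      ':ytsubs', ':ytsubscriptions'):
        assert re.match(_VALID_URL, candidate)
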
  class YoutubeTruncatedURLIE(InfoExtractor):
      IE_NAME = 'youtube:truncated_url'
      IE_DESC = False  # Do not list