Merge remote-tracking branch 'jaimeMF/yt-playlists'
author Philipp Hagemeister <phihag@phihag.de>
Fri, 22 Nov 2013 19:11:54 +0000 (20:11 +0100)
committer Philipp Hagemeister <phihag@phihag.de>
Fri, 22 Nov 2013 19:11:54 +0000 (20:11 +0100)
1  2 
youtube_dl/extractor/youtube.py

index 41838237c617f9758054cef6499ef97b17df5e93,c48c0e24f26c1ed51becad42c4cb167ff5198f1d..9b09793eb307b4ca899942e57cfa3dda88ec5caa
@@@ -139,10 -139,10 +139,10 @@@ class YoutubeBaseInfoExtractor(InfoExtr
  
  class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
      IE_DESC = u'YouTube.com'
 -    _VALID_URL = r"""^
 +    _VALID_URL = r"""(?x)^
                       (
 -                         (?:https?://)?                                       # http(s):// (optional)
 -                         (?:(?:(?:(?:\w+\.)?youtube(?:-nocookie)?\.com/|
 +                         (?:https?://|//)?                                    # http(s):// or protocol-independent URL (optional)
 +                         (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
                              tube\.majestyc\.net/|
                              youtube\.googleapis\.com/)                        # the various hostnames, with wildcard subdomains
                           (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                  u"uploader_id": u"justintimberlakeVEVO"
              }
          },
 +        {
 +            u"url":  u"//www.YouTube.com/watch?v=yZIXLfi8CZQ",
 +            u"file":  u"yZIXLfi8CZQ.mp4",
 +            u"note": u"Embed-only video (#1746)",
 +            u"info_dict": {
 +                u"upload_date": u"20120608",
 +                u"title": u"Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012",
 +                u"description": u"md5:09b78bd971f1e3e289601dfba15ca4f7",
 +                u"uploader": u"SET India",
 +                u"uploader_id": u"setindia"
 +            }
 +        },
      ]
  
  
      def suitable(cls, url):
          """Receives a URL and returns True if suitable for this IE."""
          if YoutubePlaylistIE.suitable(url): return False
 -        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
 +        return re.match(cls._VALID_URL, url) is not None
  
      def __init__(self, *args, **kwargs):
          super(YoutubeIE, self).__init__(*args, **kwargs)
          """Turn the encrypted s field into a working signature"""
  
          if player_url is not None:
 +            if player_url.startswith(u'//'):
 +                player_url = u'https:' + player_url
              try:
                  player_id = (player_url, len(s))
                  if player_id not in self._player_cache:
              params = compat_urllib_parse.urlencode({
                  'lang': lang,
                  'v': video_id,
 -                'fmt': self._downloader.params.get('subtitlesformat'),
 +                'fmt': self._downloader.params.get('subtitlesformat', 'srt'),
                  'name': l[0].encode('utf-8'),
              })
              url = u'http://www.youtube.com/api/timedtext?' + params
      def _get_available_automatic_caption(self, video_id, webpage):
          """We need the webpage for getting the captions url, pass it as an
             argument to speed up the process."""
 -        sub_format = self._downloader.params.get('subtitlesformat')
 +        sub_format = self._downloader.params.get('subtitlesformat', 'srt')
          self.to_screen(u'%s: Looking for automatic captions' % video_id)
          mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
          err_msg = u'Couldn\'t find automatic captions for %s' % video_id
              # We simulate the access to the video from www.youtube.com/v/{video_id}
              # this can be viewed without login into Youtube
              data = compat_urllib_parse.urlencode({'video_id': video_id,
 -                                                  'el': 'embedded',
 +                                                  'el': 'player_embedded',
                                                    'gl': 'US',
                                                    'hl': 'en',
                                                    'eurl': 'https://youtube.googleapis.com/v/' + video_id,
              else:
                  raise ExtractorError(u'"token" parameter not in video info for unknown reason')
  
 +        if 'view_count' in video_info:
 +            view_count = int(video_info['view_count'][0])
 +        else:
 +            view_count = None
 +
          # Check for "rental" videos
          if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
              raise ExtractorError(u'"rental" videos not supported')
                  'age_limit':    18 if age_gate else 0,
                  'annotations':  video_annotations,
                  'webpage_url': 'https://www.youtube.com/watch?v=%s' % video_id,
 +                'view_count': view_count,
              })
          return results
  
- class YoutubePlaylistIE(InfoExtractor):
+ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
      IE_DESC = u'YouTube.com playlists'
      _VALID_URL = r"""(?:
                          (?:https?://)?
                       |
                          ((?:PL|EC|UU|FL)[0-9A-Za-z-_]{10,})
                       )"""
-     _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json&safeSearch=none'
-     _MAX_RESULTS = 50
+     _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s&page=%s'
+     _MORE_PAGES_INDICATOR = r'data-link-type="next"'
+     _VIDEO_RE = r'href="/watch\?v=([0-9A-Za-z_-]{11})&amp;'
      IE_NAME = u'youtube:playlist'
  
      @classmethod
          """Receives a URL and returns True if suitable for this IE."""
          return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
  
+     def _real_initialize(self):
+         self._login()
      def _real_extract(self, url):
          # Extract playlist id
          mobj = re.match(self._VALID_URL, url, re.VERBOSE)
              else:
                  self.to_screen(u'Downloading playlist PL%s - add --no-playlist to just download video %s' % (playlist_id, video_id))
  
-         # Download playlist videos from API
-         videos = []
+         # Extract the video ids from the playlist pages
+         ids = []
  
          for page_num in itertools.count(1):
-             start_index = self._MAX_RESULTS * (page_num - 1) + 1
-             if start_index >= 1000:
-                 self._downloader.report_warning(u'Max number of results reached')
-                 break
-             url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, start_index)
+             url = self._TEMPLATE_URL % (playlist_id, page_num)
              page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)
+             # The ids are duplicated
+             new_ids = orderedSet(re.findall(self._VIDEO_RE, page))
+             ids.extend(new_ids)
  
-             try:
-                 response = json.loads(page)
-             except ValueError as err:
-                 raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
-             if 'feed' not in response:
-                 raise ExtractorError(u'Got a malformed response from YouTube API')
-             playlist_title = response['feed']['title']['$t']
-             if 'entry' not in response['feed']:
-                 # Number of videos is a multiple of self._MAX_RESULTS
+             if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                  break
  
-             for entry in response['feed']['entry']:
-                 index = entry['yt$position']['$t']
-                 if 'media$group' in entry and 'yt$videoid' in entry['media$group']:
-                     videos.append((
-                         index,
-                         'https://www.youtube.com/watch?v=' + entry['media$group']['yt$videoid']['$t']
-                     ))
-         videos = [v[1] for v in sorted(videos)]
+         playlist_title = self._og_search_title(page)
  
-         url_results = [self.url_result(vurl, 'Youtube') for vurl in videos]
-         return [self.playlist_result(url_results, playlist_id, playlist_title)]
+         url_results = [self.url_result(vid, 'Youtube') for vid in ids]
+         return self.playlist_result(url_results, playlist_id, playlist_title)
  
  
  class YoutubeChannelIE(InfoExtractor):
          # Download channel page
          channel_id = mobj.group(1)
          video_ids = []
 +        url = 'https://www.youtube.com/channel/%s/videos' % channel_id
 +        channel_page = self._download_webpage(url, channel_id)
 +        if re.search(r'channel-header-autogenerated-label', channel_page) is not None:
 +            autogenerated = True
 +        else:
 +            autogenerated = False
  
 -        # Download all channel pages using the json-based channel_ajax query
 -        for pagenum in itertools.count(1):
 -            url = self._MORE_PAGES_URL % (pagenum, channel_id)
 -            page = self._download_webpage(url, channel_id,
 -                                          u'Downloading page #%s' % pagenum)
 -
 -            page = json.loads(page)
 -
 -            ids_in_page = self.extract_videos_from_page(page['content_html'])
 -            video_ids.extend(ids_in_page)
 -
 -            if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
 -                break
 +        if autogenerated:
 +            # The videos are contained in a single page
 +            # the ajax pages can't be used, they are empty
 +            video_ids = self.extract_videos_from_page(channel_page)
 +        else:
 +            # Download all channel pages using the json-based channel_ajax query
 +            for pagenum in itertools.count(1):
 +                url = self._MORE_PAGES_URL % (pagenum, channel_id)
 +                page = self._download_webpage(url, channel_id,
 +                                              u'Downloading page #%s' % pagenum)
 +    
 +                page = json.loads(page)
 +    
 +                ids_in_page = self.extract_videos_from_page(page['content_html'])
 +                video_ids.extend(ids_in_page)
 +    
 +                if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
 +                    break
  
          self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))