Merge remote-tracking branch 'jaimeMF/yt-playlists'

author Philipp Hagemeister <phihag@phihag.de>

Fri, 22 Nov 2013 19:11:54 +0000 (20:11 +0100)

committer Philipp Hagemeister <phihag@phihag.de>

Fri, 22 Nov 2013 19:11:54 +0000 (20:11 +0100)
author Philipp Hagemeister <phihag@phihag.de>
Fri, 22 Nov 2013 19:11:54 +0000 (20:11 +0100)
committer Philipp Hagemeister <phihag@phihag.de>
Fri, 22 Nov 2013 19:11:54 +0000 (20:11 +0100)
diff --combined youtube_dl/extractor/youtube.py

index 41838237c617f9758054cef6499ef97b17df5e93,c48c0e24f26c1ed51becad42c4cb167ff5198f1d..9b09793eb307b4ca899942e57cfa3dda88ec5caa
--- 1/youtube_dl/extractor/youtube.py
--- 2/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@@ -139,10 -139,10 +139,10 @@@ class YoutubeBaseInfoExtractor(InfoExtr
   
   class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
       IE_DESC = u'YouTube.com'
- -    _VALID_URL = r"""^
+ +    _VALID_URL = r"""(?x)^
                        (
- -                         (?:https?://)?                                       # http(s):// (optional)
- -                         (?:(?:(?:(?:\w+\.)?youtube(?:-nocookie)?\.com/|
+ +                         (?:https?://|//)?                                    # http(s):// or protocol-independent URL (optional)
+ +                         (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
                               tube\.majestyc\.net/|
                               youtube\.googleapis\.com/)                        # the various hostnames, with wildcard subdomains
                            (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
@@@ -363,18 -363,6 +363,18 @@@
                   u"uploader_id": u"justintimberlakeVEVO"
               }
           },
+ +        {
+ +            u"url":  u"//www.YouTube.com/watch?v=yZIXLfi8CZQ",
+ +            u"file":  u"yZIXLfi8CZQ.mp4",
+ +            u"note": u"Embed-only video (#1746)",
+ +            u"info_dict": {
+ +                u"upload_date": u"20120608",
+ +                u"title": u"Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012",
+ +                u"description": u"md5:09b78bd971f1e3e289601dfba15ca4f7",
+ +                u"uploader": u"SET India",
+ +                u"uploader_id": u"setindia"
+ +            }
+ +        },
       ]
   
   
@@@ -382,7 -370,7 +382,7 @@@
       def suitable(cls, url):
           """Receives a URL and returns True if suitable for this IE."""
           if YoutubePlaylistIE.suitable(url): return False
- -        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
+ +        return re.match(cls._VALID_URL, url) is not None
   
       def __init__(self, *args, **kwargs):
           super(YoutubeIE, self).__init__(*args, **kwargs)
@@@ -1031,8 -1019,6 +1031,8 @@@
           """Turn the encrypted s field into a working signature"""
   
           if player_url is not None:
+ +            if player_url.startswith(u'//'):
+ +                player_url = u'https:' + player_url
               try:
                   player_id = (player_url, len(s))
                   if player_id not in self._player_cache:
@@@ -1112,7 -1098,7 +1112,7 @@@
               params = compat_urllib_parse.urlencode({
                   'lang': lang,
                   'v': video_id,
- -                'fmt': self._downloader.params.get('subtitlesformat'),
+ +                'fmt': self._downloader.params.get('subtitlesformat', 'srt'),
                   'name': l[0].encode('utf-8'),
               })
               url = u'http://www.youtube.com/api/timedtext?' + params
@@@ -1125,7 -1111,7 +1125,7 @@@
       def _get_available_automatic_caption(self, video_id, webpage):
           """We need the webpage for getting the captions url, pass it as an
              argument to speed up the process."""
- -        sub_format = self._downloader.params.get('subtitlesformat')
+ +        sub_format = self._downloader.params.get('subtitlesformat', 'srt')
           self.to_screen(u'%s: Looking for automatic captions' % video_id)
           mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
           err_msg = u'Couldn\'t find automatic captions for %s' % video_id
@@@ -1284,7 -1270,7 +1284,7 @@@
               # We simulate the access to the video from www.youtube.com/v/{video_id}
               # this can be viewed without login into Youtube
               data = compat_urllib_parse.urlencode({'video_id': video_id,
- -                                                  'el': 'embedded',
+ +                                                  'el': 'player_embedded',
                                                     'gl': 'US',
                                                     'hl': 'en',
                                                     'eurl': 'https://youtube.googleapis.com/v/' + video_id,
@@@ -1313,11 -1299,6 +1313,11 @@@
               else:
                   raise ExtractorError(u'"token" parameter not in video info for unknown reason')
   
+ +        if 'view_count' in video_info:
+ +            view_count = int(video_info['view_count'][0])
+ +        else:
+ +            view_count = None
+ +
           # Check for "rental" videos
           if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
               raise ExtractorError(u'"rental" videos not supported')
@@@ -1506,11 -1487,10 +1506,11 @@@
                   'age_limit':    18 if age_gate else 0,
                   'annotations':  video_annotations,
                   'webpage_url': 'https://www.youtube.com/watch?v=%s' % video_id,
+ +                'view_count': view_count,
               })
           return results
   
- class YoutubePlaylistIE(InfoExtractor):
+ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
       IE_DESC = u'YouTube.com playlists'
       _VALID_URL = r"""(?:
                           (?:https?://)?
@@@ -1526,8 -1506,9 +1526,9 @@@
                        |
                           ((?:PL|EC|UU|FL)[0-9A-Za-z-_]{10,})
                        )"""
-     _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json&safeSearch=none'
-     _MAX_RESULTS = 50
+     _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s&page=%s'
+     _MORE_PAGES_INDICATOR = r'data-link-type="next"'
+     _VIDEO_RE = r'href="/watch\?v=([0-9A-Za-z_-]{11})&amp;'
       IE_NAME = u'youtube:playlist'
   
       @classmethod
@@@ -1535,6 -1516,9 +1536,9 @@@
           """Receives a URL and returns True if suitable for this IE."""
           return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
   
+     def _real_initialize(self):
+         self._login()
+ 
       def _real_extract(self, url):
           # Extract playlist id
           mobj = re.match(self._VALID_URL, url, re.VERBOSE)
@@@ -1552,41 -1536,23 +1556,23 @@@
               else:
                   self.to_screen(u'Downloading playlist PL%s - add --no-playlist to just download video %s' % (playlist_id, video_id))
   
-         # Download playlist videos from API
-         videos = []
+         # Extract the video ids from the playlist pages
+         ids = []
   
           for page_num in itertools.count(1):
-             start_index = self._MAX_RESULTS * (page_num - 1) + 1
-             if start_index >= 1000:
-                 self._downloader.report_warning(u'Max number of results reached')
-                 break
-             url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, start_index)
+             url = self._TEMPLATE_URL % (playlist_id, page_num)
               page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)
+             # The ids are duplicated
+             new_ids = orderedSet(re.findall(self._VIDEO_RE, page))
+             ids.extend(new_ids)
   
-             try:
-                 response = json.loads(page)
-             except ValueError as err:
-                 raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
- 
-             if 'feed' not in response:
-                 raise ExtractorError(u'Got a malformed response from YouTube API')
-             playlist_title = response['feed']['title']['$t']
-             if 'entry' not in response['feed']:
-                 # Number of videos is a multiple of self._MAX_RESULTS
+             if re.search(self._MORE_PAGES_INDICATOR, page) is None:
                   break
   
-             for entry in response['feed']['entry']:
-                 index = entry['yt$position']['$t']
-                 if 'media$group' in entry and 'yt$videoid' in entry['media$group']:
-                     videos.append((
-                         index,
-                         'https://www.youtube.com/watch?v=' + entry['media$group']['yt$videoid']['$t']
-                     ))
- 
-         videos = [v[1] for v in sorted(videos)]
+         playlist_title = self._og_search_title(page)
   
-         url_results = [self.url_result(vurl, 'Youtube') for vurl in videos]
-         return [self.playlist_result(url_results, playlist_id, playlist_title)]
+         url_results = [self.url_result(vid, 'Youtube') for vid in ids]
+         return self.playlist_result(url_results, playlist_id, playlist_title)
   
   
   class YoutubeChannelIE(InfoExtractor):
@@@ -1612,31 -1578,20 +1598,31 @@@
           # Download channel page
           channel_id = mobj.group(1)
           video_ids = []
+ +        url = 'https://www.youtube.com/channel/%s/videos' % channel_id
+ +        channel_page = self._download_webpage(url, channel_id)
+ +        if re.search(r'channel-header-autogenerated-label', channel_page) is not None:
+ +            autogenerated = True
+ +        else:
+ +            autogenerated = False
   
- -        # Download all channel pages using the json-based channel_ajax query
- -        for pagenum in itertools.count(1):
- -            url = self._MORE_PAGES_URL % (pagenum, channel_id)
- -            page = self._download_webpage(url, channel_id,
- -                                          u'Downloading page #%s' % pagenum)
- -
- -            page = json.loads(page)
- -
- -            ids_in_page = self.extract_videos_from_page(page['content_html'])
- -            video_ids.extend(ids_in_page)
- -
- -            if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
- -                break
+ +        if autogenerated:
+ +            # The videos are contained in a single page
+ +            # the ajax pages can't be used, they are empty
+ +            video_ids = self.extract_videos_from_page(channel_page)
+ +        else:
+ +            # Download all channel pages using the json-based channel_ajax query
+ +            for pagenum in itertools.count(1):
+ +                url = self._MORE_PAGES_URL % (pagenum, channel_id)
+ +                page = self._download_webpage(url, channel_id,
+ +                                              u'Downloading page #%s' % pagenum)
+ +    
+ +                page = json.loads(page)
+ +    
+ +                ids_in_page = self.extract_videos_from_page(page['content_html'])
+ +                video_ids.extend(ids_in_page)
+ +    
+ +                if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
+ +                    break
   
           self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))
author	Philipp Hagemeister <phihag@phihag.de>
	Fri, 22 Nov 2013 19:11:54 +0000 (20:11 +0100)
committer	Philipp Hagemeister <phihag@phihag.de>
	Fri, 22 Nov 2013 19:11:54 +0000 (20:11 +0100)