Create a function in InfoExtractors that returns the InfoExtractor class with the...

[youtube-dl] / youtube_dl / InfoExtractors.py
diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py

index cf31970ef5a5b778de64a9a021483dcc7df192f8..eeedcf792b74dafd275bffb27ca21f5e8f98f387 100755 (executable)
--- a/youtube_dl/InfoExtractors.py
+++ b/youtube_dl/InfoExtractors.py
@@ -143,6 +143,28 @@ class InfoExtractor(object):
              dump = base64.b64encode(webpage_bytes).decode('ascii')
              self._downloader.to_screen(dump)
          return webpage_bytes.decode(encoding, 'replace')
+        
+    #Methods for following #608
+    #They set the correct value of the '_type' key
+    def video_result(self, video_info):
+        """Returns a video"""
+        video_info['_type'] = 'video'
+        return video_info
+    def url_result(self, url, ie=None):
+        """Returns a url that points to a page that should be processed"""
+        #TODO: ie should be the class used for getting the info
+        video_info = {'_type': 'url',
+                      'url': url}
+        return video_info
+    def playlist_result(self, entries, playlist_id=None, playlist_title=None):
+        """Returns a playlist"""
+        video_info = {'_type': 'playlist',
+                      'entries': entries}
+        if playlist_id:
+            video_info['id'] = playlist_id
+        if playlist_title:
+            video_info['title'] = playlist_title
+        return video_info
  
  
  class YoutubeIE(InfoExtractor):
@@ -706,17 +728,10 @@ class MetacafeIE(InfoExtractor):
          # Check if video comes from YouTube
          mobj2 = re.match(r'^yt-(.*)$', video_id)
          if mobj2 is not None:
-            self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
-            return
+            return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1))]
  
          # Retrieve video webpage to extract further information
-        request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
-        try:
-            self.report_download_webpage(video_id)
-            webpage = compat_urllib_request.urlopen(request).read()
-        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
-            self._downloader.report_error(u'unable retrieve video webpage: %s' % compat_str(err))
-            return
+        webpage = self._download_webpage('http://www.metacafe.com/watch/%s/' % video_id, video_id)
  
          # Extract URL, uploader and title from webpage
          self.report_extraction(video_id)
@@ -741,13 +756,13 @@ class MetacafeIE(InfoExtractor):
              if 'mediaData' not in vardict:
                  self._downloader.report_error(u'unable to extract media URL')
                  return
-            mobj = re.search(r'"mediaURL":"(http.*?)","key":"(.*?)"', vardict['mediaData'][0])
+            mobj = re.search(r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
              if mobj is None:
                  self._downloader.report_error(u'unable to extract media URL')
                  return
-            mediaURL = mobj.group(1).replace('\\/', '/')
+            mediaURL = mobj.group('mediaURL').replace('\\/', '/')
              video_extension = mediaURL[-3:]
-            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group(2))
+            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key'))
  
          mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
          if mobj is None:
@@ -1348,7 +1363,7 @@ class GenericIE(InfoExtractor):
          self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
  
      def _test_redirect(self, url):
-        """Check if it is a redirect, like url shorteners, in case restart chain."""
+        """Check if it is a redirect, like url shorteners, in case return the new url."""
          class HeadRequest(compat_urllib_request.Request):
              def get_method(self):
                  return "HEAD"
@@ -1399,11 +1414,11 @@ class GenericIE(InfoExtractor):
              return False
  
          self.report_following_redirect(new_url)
-        self._downloader.download([new_url])
-        return True
+        return new_url
  
      def _real_extract(self, url):
-        if self._test_redirect(url): return
+        new_url = self._test_redirect(url)
+        if new_url: return [self.url_result(new_url)]
  
          video_id = url.split('/')[-1]
          try:
@@ -1778,9 +1793,13 @@ class YoutubePlaylistIE(InfoExtractor):
                  self._downloader.report_error(u'Invalid JSON in API response: ' + compat_str(err))
                  return
  
-            if not 'feed' in response or not 'entry' in response['feed']:
+            if 'feed' not in response:
                  self._downloader.report_error(u'Got a malformed response from YouTube API')
                  return
+            if 'entry' not in response['feed']:
+                # Number of videos is a multiple of self._MAX_RESULTS
+                break
+
              videos += [ (entry['yt$position']['$t'], entry['content']['src'])
                          for entry in response['feed']['entry']
                          if 'content' in entry ]
@@ -1790,37 +1809,31 @@ class YoutubePlaylistIE(InfoExtractor):
              page_num += 1
  
          videos = [v[1] for v in sorted(videos)]
-        total = len(videos)
-
-        playliststart = self._downloader.params.get('playliststart', 1) - 1
-        playlistend = self._downloader.params.get('playlistend', -1)
-        if playlistend == -1:
-            videos = videos[playliststart:]
-        else:
-            videos = videos[playliststart:playlistend]
  
-        if len(videos) == total:
-            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total))
-        else:
-            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(videos)))
-
-        for video in videos:
-            self._downloader.download([video])
-        return
+        url_results = [self.url_result(url) for url in videos]
+        return [self.playlist_result(url_results, playlist_id)]
  
  
  class YoutubeChannelIE(InfoExtractor):
      """Information Extractor for YouTube channels."""
  
-    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)(?:/.*)?$"
+    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
      _TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
-    _MORE_PAGES_INDICATOR = u"Next \N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}"
+    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
+    _MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
      IE_NAME = u'youtube:channel'
  
      def report_download_page(self, channel_id, pagenum):
          """Report attempt to download channel page with given number."""
          self._downloader.to_screen(u'[youtube] Channel %s: Downloading page #%s' % (channel_id, pagenum))
  
+    def extract_videos_from_page(self, page):
+        ids_in_page = []
+        for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
+            if mobj.group(1) not in ids_in_page:
+                ids_in_page.append(mobj.group(1))
+        return ids_in_page
+
      def _real_extract(self, url):
          # Extract channel id
          mobj = re.match(self._VALID_URL, url)
@@ -1828,37 +1841,51 @@ class YoutubeChannelIE(InfoExtractor):
              self._downloader.report_error(u'invalid url: %s' % url)
              return
  
-        # Download channel pages
+        # Download channel page
          channel_id = mobj.group(1)
          video_ids = []
          pagenum = 1
  
-        while True:
-            self.report_download_page(channel_id, pagenum)
-            url = self._TEMPLATE_URL % (channel_id, pagenum)
-            request = compat_urllib_request.Request(url)
-            try:
-                page = compat_urllib_request.urlopen(request).read().decode('utf8')
-            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
-                self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
-                return
+        self.report_download_page(channel_id, pagenum)
+        url = self._TEMPLATE_URL % (channel_id, pagenum)
+        request = compat_urllib_request.Request(url)
+        try:
+            page = compat_urllib_request.urlopen(request).read().decode('utf8')
+        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
+            self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
+            return
  
-            # Extract video identifiers
-            ids_in_page = []
-            for mobj in re.finditer(r'href="/watch\?v=([0-9A-Za-z_-]+)&', page):
-                if mobj.group(1) not in ids_in_page:
-                    ids_in_page.append(mobj.group(1))
-            video_ids.extend(ids_in_page)
+        # Extract video identifiers
+        ids_in_page = self.extract_videos_from_page(page)
+        video_ids.extend(ids_in_page)
  
-            if self._MORE_PAGES_INDICATOR not in page:
-                break
-            pagenum = pagenum + 1
+        # Download any subsequent channel pages using the json-based channel_ajax query
+        if self._MORE_PAGES_INDICATOR in page:
+            while True:
+                pagenum = pagenum + 1
+
+                self.report_download_page(channel_id, pagenum)
+                url = self._MORE_PAGES_URL % (pagenum, channel_id)
+                request = compat_urllib_request.Request(url)
+                try:
+                    page = compat_urllib_request.urlopen(request).read().decode('utf8')
+                except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
+                    self._downloader.report_error(u'unable to download webpage: %s' % compat_str(err))
+                    return
+
+                page = json.loads(page)
+
+                ids_in_page = self.extract_videos_from_page(page['content_html'])
+                video_ids.extend(ids_in_page)
+
+                if self._MORE_PAGES_INDICATOR  not in page['load_more_widget_html']:
+                    break
  
          self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))
  
-        for id in video_ids:
-            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
-        return
+        urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids]
+        url_entries = [self.url_result(url) for url in urls]
+        return [self.playlist_result(url_entries, channel_id)]
  
  
  class YoutubeUserIE(InfoExtractor):
@@ -1928,20 +1955,9 @@ class YoutubeUserIE(InfoExtractor):
  
              pagenum += 1
  
-        all_ids_count = len(video_ids)
-        playliststart = self._downloader.params.get('playliststart', 1) - 1
-        playlistend = self._downloader.params.get('playlistend', -1)
-
-        if playlistend == -1:
-            video_ids = video_ids[playliststart:]
-        else:
-            video_ids = video_ids[playliststart:playlistend]
-
-        self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
-                (username, all_ids_count, len(video_ids)))
-
-        for video_id in video_ids:
-            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
+        urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
+        url_results = [self.url_result(url) for url in urls]
+        return [self.playlist_result(url_results, playlist_title = username)]
  
  
  class BlipTVUserIE(InfoExtractor):
@@ -2019,20 +2035,12 @@ class BlipTVUserIE(InfoExtractor):
  
              pagenum += 1
  
-        all_ids_count = len(video_ids)
-        playliststart = self._downloader.params.get('playliststart', 1) - 1
-        playlistend = self._downloader.params.get('playlistend', -1)
-
-        if playlistend == -1:
-            video_ids = video_ids[playliststart:]
-        else:
-            video_ids = video_ids[playliststart:playlistend]
-
          self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
                  (self.IE_NAME, username, all_ids_count, len(video_ids)))
  
-        for video_id in video_ids:
-            self._downloader.download([u'http://blip.tv/'+video_id])
+        urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
+        url_entries = [self.url_result(url) for url in urls]
+        return [self.playlist_result(url_entries, playlist_title = username)]
  
  
  class DepositFilesIE(InfoExtractor):
@@ -4461,3 +4469,7 @@ def gen_extractors():
          ARDIE(),
          GenericIE()
      ]
+
+def get_info_extractor(ie_name):
+    """Returns the info extractor class with the given ie_name"""
+    return globals()[ie_name+'IE']