Merge branch 'extract_info_rewrite'

[youtube-dl] / youtube_dl / InfoExtractors.py
diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py

index 3d8145e162ca696389dd20935dfefa4b5b18d131..ae36558d75839f68facb72300efb5b4c22bcd809 100755 (executable)
--- a/youtube_dl/InfoExtractors.py
+++ b/youtube_dl/InfoExtractors.py
@@ -115,7 +115,8 @@ class InfoExtractor(object):
          """ Returns the response handle """
          if note is None:
              note = u'Downloading video webpage'
-        self._downloader.to_screen(u'[%s] %s: %s' % (self.IE_NAME, video_id, note))
+        if note is not False:
+            self._downloader.to_screen(u'[%s] %s: %s' % (self.IE_NAME, video_id, note))
          try:
              return compat_urllib_request.urlopen(url_or_request)
          except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
@@ -133,7 +134,37 @@ class InfoExtractor(object):
          else:
              encoding = 'utf-8'
          webpage_bytes = urlh.read()
+        if self._downloader.params.get('dump_intermediate_pages', False):
+            try:
+                url = url_or_request.get_full_url()
+            except AttributeError:
+                url = url_or_request
+            self._downloader.to_screen(u'Dumping request to ' + url)
+            dump = base64.b64encode(webpage_bytes).decode('ascii')
+            self._downloader.to_screen(dump)
          return webpage_bytes.decode(encoding, 'replace')
+        
+    # Helper methods implementing #608.
+    # Each one sets the correct value of the '_type' key on the result it returns.
+    def video_result(self, video_info):
+        """Mark video_info as a 'video' result (modified in place) and return it."""
+        video_info.update({'_type': 'video'})
+        return video_info
+    def url_result(self, url, ie=None):
+        """Build a 'url' result for a page that still has to be processed."""
+        # TODO: ie should be the class used for getting the info
+        # (the argument is accepted but not used yet)
+        result = dict(_type='url', url=url)
+        return result
+    def playlist_result(self, entries, playlist_id=None, playlist_title=None):
+        """Build a 'playlist' result wrapping entries; falsy id/title are left out."""
+        result = {'_type': 'playlist', 'entries': entries}
+        optional = (('id', playlist_id), ('title', playlist_title))
+        for key, value in optional:
+            if value:
+                result[key] = value
+        # 'id' and 'title' are only present when a truthy value was passed
+        return result
  
  
  class YoutubeIE(InfoExtractor):
@@ -463,18 +494,14 @@ class YoutubeIE(InfoExtractor):
          # Get video info
          self.report_video_info_webpage_download(video_id)
          for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
-            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
+            video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                      % (video_id, el_type))
-            request = compat_urllib_request.Request(video_info_url)
-            try:
-                video_info_webpage_bytes = compat_urllib_request.urlopen(request).read()
-                video_info_webpage = video_info_webpage_bytes.decode('utf-8', 'ignore')
-                video_info = compat_parse_qs(video_info_webpage)
-                if 'token' in video_info:
-                    break
-            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
-                self._downloader.report_error(u'unable to download video info webpage: %s' % compat_str(err))
-                return
+            video_info_webpage = self._download_webpage(video_info_url, video_id,
+                                    note=False,
+                                    errnote='unable to download video info webpage')
+            video_info = compat_parse_qs(video_info_webpage)
+            if 'token' in video_info:
+                break
          if 'token' not in video_info:
              if 'reason' in video_info:
                  self._downloader.report_error(u'YouTube said: %s' % video_info['reason'][0])
@@ -701,8 +728,7 @@ class MetacafeIE(InfoExtractor):
          # Check if video comes from YouTube
          mobj2 = re.match(r'^yt-(.*)$', video_id)
          if mobj2 is not None:
-            self._downloader.download(['http://www.youtube.com/watch?v=%s' % mobj2.group(1)])
-            return
+            return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1))]
  
          # Retrieve video webpage to extract further information
          request = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
@@ -1130,7 +1156,7 @@ class VimeoIE(InfoExtractor):
          # Extract video description
          video_description = get_element_by_attribute("itemprop", "description", webpage)
          if video_description: video_description = clean_html(video_description)
-        else: video_description = ''
+        else: video_description = u''
  
          # Extract upload date
          video_upload_date = None
@@ -1343,7 +1369,7 @@ class GenericIE(InfoExtractor):
          self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
  
      def _test_redirect(self, url):
-        """Check if it is a redirect, like url shorteners, in case restart chain."""
+        """Check if it is a redirect, like url shorteners, in case return the new url."""
          class HeadRequest(compat_urllib_request.Request):
              def get_method(self):
                  return "HEAD"
@@ -1394,11 +1420,11 @@ class GenericIE(InfoExtractor):
              return False
  
          self.report_following_redirect(new_url)
-        self._downloader.download([new_url])
-        return True
+        return new_url
  
      def _real_extract(self, url):
-        if self._test_redirect(url): return
+        new_url = self._test_redirect(url)
+        if new_url: return [self.url_result(new_url)]
  
          video_id = url.split('/')[-1]
          try:
@@ -1773,9 +1799,13 @@ class YoutubePlaylistIE(InfoExtractor):
                  self._downloader.report_error(u'Invalid JSON in API response: ' + compat_str(err))
                  return
  
-            if not 'feed' in response or not 'entry' in response['feed']:
+            if 'feed' not in response:
                  self._downloader.report_error(u'Got a malformed response from YouTube API')
                  return
+            if 'entry' not in response['feed']:
+                # No entries on this page (the number of videos is a multiple of self._MAX_RESULTS): stop paging
+                break
+
              videos += [ (entry['yt$position']['$t'], entry['content']['src'])
                          for entry in response['feed']['entry']
                          if 'content' in entry ]
@@ -1785,23 +1815,9 @@ class YoutubePlaylistIE(InfoExtractor):
              page_num += 1
  
          videos = [v[1] for v in sorted(videos)]
-        total = len(videos)
  
-        playliststart = self._downloader.params.get('playliststart', 1) - 1
-        playlistend = self._downloader.params.get('playlistend', -1)
-        if playlistend == -1:
-            videos = videos[playliststart:]
-        else:
-            videos = videos[playliststart:playlistend]
-
-        if len(videos) == total:
-            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos' % (playlist_id, total))
-        else:
-            self._downloader.to_screen(u'[youtube] PL %s: Found %i videos, downloading %i' % (playlist_id, total, len(videos)))
-
-        for video in videos:
-            self._downloader.download([video])
-        return
+        url_results = [self.url_result(url) for url in videos]
+        return [self.playlist_result(url_results, playlist_id)]
  
  
  class YoutubeChannelIE(InfoExtractor):
@@ -1851,9 +1867,9 @@ class YoutubeChannelIE(InfoExtractor):
  
          self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))
  
-        for id in video_ids:
-            self._downloader.download(['http://www.youtube.com/watch?v=%s' % id])
-        return
+        urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids]
+        url_entries = [self.url_result(url) for url in urls]
+        return [self.playlist_result(url_entries, channel_id)]
  
  
  class YoutubeUserIE(InfoExtractor):
@@ -1923,20 +1939,9 @@ class YoutubeUserIE(InfoExtractor):
  
              pagenum += 1
  
-        all_ids_count = len(video_ids)
-        playliststart = self._downloader.params.get('playliststart', 1) - 1
-        playlistend = self._downloader.params.get('playlistend', -1)
-
-        if playlistend == -1:
-            video_ids = video_ids[playliststart:]
-        else:
-            video_ids = video_ids[playliststart:playlistend]
-
-        self._downloader.to_screen(u"[youtube] user %s: Collected %d video ids (downloading %d of them)" %
-                (username, all_ids_count, len(video_ids)))
-
-        for video_id in video_ids:
-            self._downloader.download(['http://www.youtube.com/watch?v=%s' % video_id])
+        urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
+        url_results = [self.url_result(url) for url in urls]
+        return [self.playlist_result(url_results, playlist_title = username)]
  
  
  class BlipTVUserIE(InfoExtractor):
@@ -2014,20 +2019,12 @@ class BlipTVUserIE(InfoExtractor):
  
              pagenum += 1
  
-        all_ids_count = len(video_ids)
-        playliststart = self._downloader.params.get('playliststart', 1) - 1
-        playlistend = self._downloader.params.get('playlistend', -1)
-
-        if playlistend == -1:
-            video_ids = video_ids[playliststart:]
-        else:
-            video_ids = video_ids[playliststart:playlistend]
-
          self._downloader.to_screen(u"[%s] user %s: Collected %d video ids (downloading %d of them)" %
                  (self.IE_NAME, username, all_ids_count, len(video_ids)))
  
-        for video_id in video_ids:
-            self._downloader.download([u'http://blip.tv/'+video_id])
+        urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
+        url_entries = [self.url_result(url) for url in urls]
+        return [self.playlist_result(url_entries, playlist_title = username)]
  
  
  class DepositFilesIE(InfoExtractor):
@@ -2156,7 +2153,7 @@ class FacebookIE(InfoExtractor):
          url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
          webpage = self._download_webpage(url, video_id)
  
-        BEFORE = '[["allowFullScreen","true"],["allowScriptAccess","always"],["salign","tl"],["scale","noscale"],["wmode","opaque"]].forEach(function(param) {swf.addParam(param[0], param[1]);});\n'
+        BEFORE = '{swf.addParam(param[0], param[1]);});\n'
          AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
          m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
          if not m:
@@ -2164,12 +2161,14 @@ class FacebookIE(InfoExtractor):
          data = dict(json.loads(m.group(1)))
          params_raw = compat_urllib_parse.unquote(data['params'])
          params = json.loads(params_raw)
-        video_url = params['hd_src']
+        video_data = params['video_data'][0]
+        video_url = video_data.get('hd_src')
          if not video_url:
-            video_url = params['sd_src']
+            video_url = video_data['sd_src']
          if not video_url:
              raise ExtractorError(u'Cannot find video URL')
-        video_duration = int(params['video_duration'])
+        video_duration = int(video_data['video_duration'])
+        thumbnail = video_data['thumbnail_src']
  
          m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
          if not m:
@@ -2182,7 +2181,7 @@ class FacebookIE(InfoExtractor):
              'url': video_url,
              'ext': 'mp4',
              'duration': video_duration,
-            'thumbnail': params['thumbnail_src'],
+            'thumbnail': thumbnail,
          }
          return [info]
  
@@ -3697,7 +3696,9 @@ class FunnyOrDieIE(InfoExtractor):
  
          m = re.search(r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", webpage, flags=re.DOTALL)
          if not m:
-            self._downloader.trouble(u'Cannot find video title')
+            m = re.search(r'<title>(?P<title>[^<]+?)</title>', webpage)
+            if not m:
+                self._downloader.trouble(u'Cannot find video title')
          title = clean_html(m.group('title'))
  
          m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
@@ -4131,7 +4132,7 @@ class KeekIE(InfoExtractor):
          video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
          thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
          webpage = self._download_webpage(url, video_id)
-        m = re.search(r'<meta property="og:title" content="(?P<title>.+)"', webpage)
+        m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
          title = unescapeHTML(m.group('title'))
          m = re.search(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>', webpage)
          uploader = clean_html(m.group('uploader'))
@@ -4357,15 +4358,19 @@ class LiveLeakIE(InfoExtractor):
          return [info]
  
  class ARDIE(InfoExtractor):
-    IE_NAME = 'ard'
-    _VALID_URL = r'^(?:http?://)?mediathek\.daserste\.de/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
-    _TITLE = r'<h1 class="boxTopHeadline">(?P<title>.*)</h1>'
+    _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
+    _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
      _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'
  
      def _real_extract(self, url):
          # determine video id from url
          m = re.match(self._VALID_URL, url)
-        video_id = m.group('video_id')
+
+        numid = re.search(r'documentId=([0-9]+)', url)
+        if numid:
+            video_id = numid.group(1)
+        else:
+            video_id = m.group('video_id')
  
          # determine title and media streams from webpage
          html = self._download_webpage(url, video_id)
@@ -4377,8 +4382,8 @@ class ARDIE(InfoExtractor):
              return
  
          # choose default media type and highest quality for now
-        stream = max([s for s in streams if int(s["media_type"]) == 0], key=lambda s: int(s["quality"]))
-        #stream = streams[-1]
+        stream = max([s for s in streams if int(s["media_type"]) == 0],
+                     key=lambda s: int(s["quality"]))
  
          # there's two possibilities: RTMP stream or HTTP download
          info = {'id': video_id, 'title': title, 'ext': 'mp4'}