Option to dump intermediate pages

[youtube-dl] / youtube_dl / InfoExtractors.py
diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py

index cca7e1b54be0766bdf3086b25ba25a85f2819e98..94e4451ed935312cb8c06a1a827637c2a32fe59b 100755 (executable)
--- a/youtube_dl/InfoExtractors.py
+++ b/youtube_dl/InfoExtractors.py
@@ -115,7 +115,8 @@ class InfoExtractor(object):
          """ Returns the response handle """
          if note is None:
              note = u'Downloading video webpage'
-        self._downloader.to_screen(u'[%s] %s: %s' % (self.IE_NAME, video_id, note))
+        if note is not False:
+            self._downloader.to_screen(u'[%s] %s: %s' % (self.IE_NAME, video_id, note))
          try:
              return compat_urllib_request.urlopen(url_or_request)
          except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
@@ -133,6 +134,14 @@ class InfoExtractor(object):
          else:
              encoding = 'utf-8'
          webpage_bytes = urlh.read()
+        if self._downloader.params.get('dump_intermediate_pages', False):
+            try:
+                url = url_or_request.get_full_url()
+            except AttributeError:
+                url = url_or_request
+            self._downloader.to_screen(u'Dumping request to ' + url)
+            dump = base64.b64encode(webpage_bytes).decode('ascii')
+            self._downloader.to_screen(dump)
          return webpage_bytes.decode(encoding, 'replace')
  
  
@@ -253,11 +262,11 @@ class YoutubeIE(InfoExtractor):
          try:
              sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
          except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
-            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
+            return (u'unable to download video subtitles: %s' % compat_str(err), None)
          sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
          sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
          if not sub_lang_list:
-            return (u'WARNING: video doesn\'t have subtitles', None)
+            return (u'video doesn\'t have subtitles', None)
          return sub_lang_list
  
      def _list_available_subtitles(self, video_id):
@@ -265,6 +274,10 @@ class YoutubeIE(InfoExtractor):
          self.report_video_subtitles_available(video_id, sub_lang_list)
  
      def _request_subtitle(self, sub_lang, sub_name, video_id, format):
+        """
+        Return tuple:
+        (error_message, sub_lang, sub)
+        """
          self.report_video_subtitles_request(video_id, sub_lang, format)
          params = compat_urllib_parse.urlencode({
              'lang': sub_lang,
@@ -276,14 +289,20 @@ class YoutubeIE(InfoExtractor):
          try:
              sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
          except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
-            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
+            return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
          if not sub:
-            return (u'WARNING: Did not fetch video subtitles', None)
+            return (u'Did not fetch video subtitles', None, None)
          return (None, sub_lang, sub)
  
      def _extract_subtitle(self, video_id):
+        """
+        Return a list with a tuple:
+        [(error_message, sub_lang, sub)]
+        """
          sub_lang_list = self._get_available_subtitles(video_id)
          sub_format = self._downloader.params.get('subtitlesformat')
+        if  isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
+            return [(sub_lang_list[0], None, None)]
          if self._downloader.params.get('subtitleslang', False):
              sub_lang = self._downloader.params.get('subtitleslang')
          elif 'en' in sub_lang_list:
@@ -291,7 +310,7 @@ class YoutubeIE(InfoExtractor):
          else:
              sub_lang = list(sub_lang_list.keys())[0]
          if not sub_lang in sub_lang_list:
-            return (u'WARNING: no closed captions found in the specified language "%s"' % sub_lang, None)
+            return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]
  
          subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
          return [subtitle]
@@ -299,6 +318,8 @@ class YoutubeIE(InfoExtractor):
      def _extract_all_subtitles(self, video_id):
          sub_lang_list = self._get_available_subtitles(video_id)
          sub_format = self._downloader.params.get('subtitlesformat')
+        if  isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
+            return [(sub_lang_list[0], None, None)]
          subtitles = []
          for sub_lang in sub_lang_list:
              subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
@@ -451,18 +472,14 @@ class YoutubeIE(InfoExtractor):
          # Get video info
          self.report_video_info_webpage_download(video_id)
          for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
-            video_info_url = ('http://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
+            video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                      % (video_id, el_type))
-            request = compat_urllib_request.Request(video_info_url)
-            try:
-                video_info_webpage_bytes = compat_urllib_request.urlopen(request).read()
-                video_info_webpage = video_info_webpage_bytes.decode('utf-8', 'ignore')
-                video_info = compat_parse_qs(video_info_webpage)
-                if 'token' in video_info:
-                    break
-            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
-                self._downloader.report_error(u'unable to download video info webpage: %s' % compat_str(err))
-                return
+            video_info_webpage = self._download_webpage(video_info_url, video_id,
+                                    note=False,
+                                    errnote='unable to download video info webpage')
+            video_info = compat_parse_qs(video_info_webpage)
+            if 'token' in video_info:
+                break
          if 'token' not in video_info:
              if 'reason' in video_info:
                  self._downloader.report_error(u'YouTube said: %s' % video_info['reason'][0])
@@ -532,14 +549,14 @@ class YoutubeIE(InfoExtractor):
              if video_subtitles:
                  (sub_error, sub_lang, sub) = video_subtitles[0]
                  if sub_error:
-                    self._downloader.trouble(sub_error)
+                    self._downloader.report_error(sub_error)
  
          if self._downloader.params.get('allsubtitles', False):
              video_subtitles = self._extract_all_subtitles(video_id)
              for video_subtitle in video_subtitles:
                  (sub_error, sub_lang, sub) = video_subtitle
                  if sub_error:
-                    self._downloader.trouble(sub_error)
+                    self._downloader.report_error(sub_error)
  
          if self._downloader.params.get('listsubtitles', False):
              sub_lang_list = self._list_available_subtitles(video_id)
@@ -1118,7 +1135,7 @@ class VimeoIE(InfoExtractor):
          # Extract video description
          video_description = get_element_by_attribute("itemprop", "description", webpage)
          if video_description: video_description = clean_html(video_description)
-        else: video_description = ''
+        else: video_description = u''
  
          # Extract upload date
          video_upload_date = None
@@ -4344,6 +4361,46 @@ class LiveLeakIE(InfoExtractor):
  
          return [info]
  
+class ARDIE(InfoExtractor):
+    _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
+    _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
+    _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'
+
+    def _real_extract(self, url):
+        # determine video id from url
+        m = re.match(self._VALID_URL, url)
+
+        numid = re.search(r'documentId=([0-9]+)', url)
+        if numid:
+            video_id = numid.group(1)
+        else:
+            video_id = m.group('video_id')
+
+        # determine title and media streams from webpage
+        html = self._download_webpage(url, video_id)
+        title = re.search(self._TITLE, html).group('title')
+        streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
+        if not streams:
+            assert '"fsk"' in html
+            self._downloader.report_error(u'this video is only available after 8:00 pm')
+            return
+
+        # choose default media type and highest quality for now
+        stream = max([s for s in streams if int(s["media_type"]) == 0],
+                     key=lambda s: int(s["quality"]))
+
+        # there's two possibilities: RTMP stream or HTTP download
+        info = {'id': video_id, 'title': title, 'ext': 'mp4'}
+        if stream['rtmp_url']:
+            self._downloader.to_screen(u'[%s] RTMP download detected' % self.IE_NAME)
+            assert stream['video_url'].startswith('mp4:')
+            info["url"] = stream["rtmp_url"]
+            info["play_path"] = stream['video_url']
+        else:
+            assert stream["video_url"].endswith('.mp4')
+            info["url"] = stream["video_url"]
+        return [info]
+
  
  def gen_extractors():
      """ Return a list of an instance of every supported extractor.
@@ -4397,5 +4454,6 @@ def gen_extractors():
          MySpassIE(),
          SpiegelIE(),
          LiveLeakIE(),
+        ARDIE(),
          GenericIE()
      ]