Removed conversion from YouTube closed caption format to SRT, since the YouTube API supports the SRT format directly (requested via the 'fmt': 'srt' parameter)

[youtube-dl] / youtube_dl / InfoExtractors.py
diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py

index 53fab690a41c55730b6339fd213b72345e3cd692..e3998fbe88173ae1cb949d2f6256fa0fc06fb530 100755 (executable)
--- a/youtube_dl/InfoExtractors.py
+++ b/youtube_dl/InfoExtractors.py
@@ -228,23 +228,6 @@ class YoutubeIE(InfoExtractor):
          """Indicate the download will use the RTMP protocol."""
          self._downloader.to_screen(u'[youtube] RTMP download detected')
  
-    def _closed_captions_xml_to_srt(self, xml_string):
-        srt = ''
-        texts = re.findall(r'<text start="([\d\.]+)"( dur="([\d\.]+)")?>([^<]+)</text>', xml_string, re.MULTILINE)
-        # TODO parse xml instead of regex
-        for n, (start, dur_tag, dur, caption) in enumerate(texts):
-            if not dur: dur = '4'
-            start = float(start)
-            end = start + float(dur)
-            start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
-            end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
-            caption = unescapeHTML(caption)
-            caption = unescapeHTML(caption) # double cycle, intentional
-            srt += str(n+1) + '\n'
-            srt += start + ' --> ' + end + '\n'
-            srt += caption + '\n\n'
-        return srt
-
      def _extract_subtitles(self, video_id):
          self.report_video_subtitles_download(video_id)
          request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
@@ -268,15 +251,16 @@ class YoutubeIE(InfoExtractor):
              'lang': srt_lang,
              'name': srt_lang_list[srt_lang].encode('utf-8'),
              'v': video_id,
+            'fmt': 'srt',
          })
          url = 'http://www.youtube.com/api/timedtext?' + params
          try:
-            srt_xml = compat_urllib_request.urlopen(url).read().decode('utf-8')
+            srt = compat_urllib_request.urlopen(url).read().decode('utf-8')
          except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
              return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
-        if not srt_xml:
+        if not srt:
              return (u'WARNING: Did not fetch video subtitles', None)
-        return (None, self._closed_captions_xml_to_srt(srt_xml))
+        return (None, srt)
  
      def _print_formats(self, formats):
          print('Available formats:')
@@ -1330,7 +1314,7 @@ class GenericIE(InfoExtractor):
          opener = compat_urllib_request.OpenerDirector()
          for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                          HTTPMethodFallback, HEADRedirectHandler,
-                        compat_urllib_error.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
+                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
              opener.add_handler(handler())
  
          response = opener.open(HeadRequest(url))
@@ -1366,6 +1350,9 @@ class GenericIE(InfoExtractor):
          if mobj is None:
              # Broaden the search a little bit
              mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
+        if mobj is None:
+            # Broaden the search a little bit: JWPlayer JS loader
+            mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
          if mobj is None:
              self._downloader.trouble(u'ERROR: Invalid URL: %s' % url)
              return
@@ -2098,6 +2085,10 @@ class FacebookIE(InfoExtractor):
          params_raw = compat_urllib_parse.unquote(data['params'])
          params = json.loads(params_raw)
          video_url = params['hd_src']
+        if not video_url:
+            video_url = params['sd_src']
+        if not video_url:
+            raise ExtractorError(u'Cannot find video URL')
          video_duration = int(params['video_duration'])
  
          m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
@@ -2233,7 +2224,7 @@ class MyVideoIE(InfoExtractor):
          webpage = self._download_webpage(webpage_url, video_id)
  
          self.report_extraction(video_id)
-        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/[^.]+\.jpg\' />',
+        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\' />',
                   webpage)
          if mobj is None:
              self._downloader.trouble(u'ERROR: unable to extract media URL')
@@ -3725,13 +3716,13 @@ class YouPornIE(InfoExtractor):
          webpage = self._download_webpage(req, video_id)
  
          # Get the video title
-        result = re.search(r'videoTitleArea">(?P<title>.*)</h1>', webpage)
+        result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
          if result is None:
-            raise ExtractorError(u'ERROR: unable to extract video title')
+            raise ExtractorError(u'Unable to extract video title')
          video_title = result.group('title').strip()
  
          # Get the video date
-        result = re.search(r'Date:</b>(?P<date>.*)</li>', webpage)
+        result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
          if result is None:
              self._downloader.to_stderr(u'WARNING: unable to extract video date')
              upload_date = None
@@ -3739,9 +3730,9 @@ class YouPornIE(InfoExtractor):
              upload_date = result.group('date').strip()
  
          # Get the video uploader
-        result = re.search(r'Submitted:</b>(?P<uploader>.*)</li>', webpage)
+        result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
          if result is None:
-            self._downloader.to_stderr(u'ERROR: unable to extract uploader')
+            self._downloader.to_stderr(u'WARNING: unable to extract uploader')
              video_uploader = None
          else:
              video_uploader = result.group('uploader').strip()