[limelight] extract http formats

[youtube-dl] / youtube_dl / extractor / pbs.py
diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py

index 75c36a621dd873aab5690587796d2eb903e09fee..f6f423597fe4952427f226fe276e17d2539eaddc 100644 (file)
--- a/youtube_dl/extractor/pbs.py
+++ b/youtube_dl/extractor/pbs.py
@@ -512,13 +512,18 @@ class PBSIE(InfoExtractor):
          if http_url:
              for m3u8_format in m3u8_formats:
                  bitrate = self._search_regex(r'(\d+k)', m3u8_format['url'], 'bitrate', default=None)
-                if not bitrate:
+                # Only extract the formats that are known to be available over HTTP.
+                # https://projects.pbs.org/confluence/display/coveapi/COVE+Video+Specifications
+                if not bitrate or bitrate not in ('400k', '800k', '1200k', '2500k'):
+                    continue
+                f_url = re.sub(r'\d+k|baseline', bitrate, http_url)
+                # This may produce invalid links sometimes (e.g.
+                # http://www.pbs.org/wgbh/frontline/film/suicide-plan)
+                if not self._is_valid_url(f_url, display_id, 'http-%s video' % bitrate):
                      continue
-                if bitrate == '192k':
-                    bitrate = 'baseline'
                  f = m3u8_format.copy()
                  f.update({
-                    'url': re.sub(r'\d+k|baseline', bitrate, http_url),
+                    'url': f_url,
                      'format_id': m3u8_format['format_id'].replace('hls', 'http'),
                      'protocol': 'http',
                  })
@@ -537,6 +542,19 @@ class PBSIE(InfoExtractor):
                  'ext': 'ttml',
                  'url': closed_captions_url,
              }]
+            mobj = re.search(r'/(\d+)_Encoded\.dfxp', closed_captions_url)
+            if mobj:
+                ttml_caption_suffix, ttml_caption_id = mobj.group(0, 1)
+                ttml_caption_id = int(ttml_caption_id)
+                subtitles['en'].extend([{
+                    'url': closed_captions_url.replace(
+                        ttml_caption_suffix, '/%d_Encoded.srt' % (ttml_caption_id + 1)),
+                    'ext': 'srt',
+                }, {
+                    'url': closed_captions_url.replace(
+                        ttml_caption_suffix, '/%d_Encoded.vtt' % (ttml_caption_id + 2)),
+                    'ext': 'vtt',
+                }])
  
          # info['title'] is often incomplete (e.g. 'Full Episode', 'Episode 5', etc)
          # Try turning it to 'program - title' naming scheme if possible