Merge branch 'brightcove_in_page_embed' of https://github.com/remitamine/youtube...
[youtube-dl] / youtube_dl / extractor / pbs.py
index 7b868d0570092c21e86876b8125cb08a36441b04..8fb9b1849cfd96e5ce21ef2ffcffba200f8ba482 100644 (file)
@@ -8,6 +8,7 @@ from ..utils import (
     ExtractorError,
     determine_ext,
     int_or_none,
+    strip_jsonp,
     unified_strdate,
     US_RATINGS,
 )
@@ -153,6 +154,22 @@ class PBSIE(InfoExtractor):
             'params': {
                 'skip_download': True,  # requires ffmpeg
             },
+        },
+        {
+            # Frontline video embedded via flp2012.js
+            'url': 'http://www.pbs.org/wgbh/pages/frontline/the-atomic-artists',
+            'info_dict': {
+                'id': '2070868960',
+                'display_id': 'the-atomic-artists',
+                'ext': 'mp4',
+                'title': 'FRONTLINE - The Atomic Artists',
+                'description': 'md5:f5bfbefadf421e8bb8647602011caf8e',
+                'duration': 723,
+                'thumbnail': 're:^https?://.*\.jpg$',
+            },
+            'params': {
+                'skip_download': True,  # requires ffmpeg
+            },
         }
     ]
     _ERRORS = {
@@ -191,6 +208,23 @@ class PBSIE(InfoExtractor):
             if media_id:
                 return media_id, presumptive_id, upload_date
 
+            # Fronline video embedded via flp
+            video_id = self._search_regex(
+                r'videoid\s*:\s*"([\d+a-z]{7,})"', webpage, 'videoid', default=None)
+            if video_id:
+                # pkg_id calculation is reverse engineered from
+                # http://www.pbs.org/wgbh/pages/frontline/js/flp2012.js
+                prg_id = self._search_regex(
+                    r'videoid\s*:\s*"([\d+a-z]{7,})"', webpage, 'videoid')[7:]
+                if 'q' in prg_id:
+                    prg_id = prg_id.split('q')[1]
+                prg_id = int(prg_id, 16)
+                getdir = self._download_json(
+                    'http://www.pbs.org/wgbh/pages/frontline/.json/getdir/getdir%d.json' % prg_id,
+                    presumptive_id, 'Downloading getdir JSON',
+                    transform_source=strip_jsonp)
+                return getdir['mid'], presumptive_id, upload_date
+
             for iframe in re.findall(r'(?s)<iframe(.+?)></iframe>', webpage):
                 url = self._search_regex(
                     r'src=(["\'])(?P<url>.+?partnerplayer.+?)\1', iframe,