[youtube] Move metadata extraction after video availability check
[youtube-dl] / youtube_dl / extractor / pbs.py
index 16cc667d025514f64a66852d7e4ad0e8952297b8..a28ee17caa5ec19b245bf39cf43e3fa448bc936a 100644 (file)
@@ -187,9 +187,9 @@ class PBSIE(InfoExtractor):
     _VALID_URL = r'''(?x)https?://
         (?:
            # Direct video URL
-           (?:%s)/(?:viralplayer|video)/(?P<id>[0-9]+)/? |
+           (?:%s)/(?:(?:vir|port)alplayer|video)/(?P<id>[0-9]+)(?:[?/]|$) |
            # Article with embedded player (or direct video)
-           (?:www\.)?pbs\.org/(?:[^/]+/){2,5}(?P<presumptive_id>[^/]+?)(?:\.html)?/?(?:$|[?\#]) |
+           (?:www\.)?pbs\.org/(?:[^/]+/){1,5}(?P<presumptive_id>[^/]+?)(?:\.html)?/?(?:$|[?\#]) |
            # Player
            (?:video|player)\.pbs\.org/(?:widget/)?partnerplayer/(?P<player_id>[^/]+)/
         )
@@ -345,6 +345,21 @@ class PBSIE(InfoExtractor):
                 'formats': 'mincount:8',
             },
         },
+        {
+            # https://github.com/rg3/youtube-dl/issues/13801
+            'url': 'https://www.pbs.org/video/pbs-newshour-full-episode-july-31-2017-1501539057/',
+            'info_dict': {
+                'id': '3003333873',
+                'ext': 'mp4',
+                'title': 'PBS NewsHour - full episode July 31, 2017',
+                'description': 'md5:d41d8cd98f00b204e9800998ecf8427e',
+                'duration': 3265,
+                'thumbnail': r're:^https?://.*\.jpg$',
+            },
+            'params': {
+                'skip_download': True,
+            },
+        },
         {
             'url': 'http://player.pbs.org/widget/partnerplayer/2365297708/?start=0&end=0&chapterbar=false&endscreen=false&topbar=true',
             'only_matching': True,
@@ -352,6 +367,10 @@ class PBSIE(InfoExtractor):
         {
             'url': 'http://watch.knpb.org/video/2365616055/',
             'only_matching': True,
+        },
+        {
+            'url': 'https://player.pbs.org/portalplayer/3004638221/?uid=',
+            'only_matching': True,
         }
     ]
     _ERRORS = {
@@ -402,6 +421,7 @@ class PBSIE(InfoExtractor):
                 r'class="coveplayerid">([^<]+)<',                       # coveplayer
                 r'<section[^>]+data-coveid="(\d+)"',                    # coveplayer from http://www.pbs.org/wgbh/frontline/film/real-csi/
                 r'<input type="hidden" id="pbs_video_id_[0-9]+" value="([0-9]+)"/>',  # jwplayer
+                r"(?s)window\.PBS\.playerConfig\s*=\s*{.*?id\s*:\s*'([0-9]+)',",
             ]
 
             media_id = self._search_regex(
@@ -433,6 +453,9 @@ class PBSIE(InfoExtractor):
                 if url:
                     break
 
+            if not url:
+                url = self._og_search_url(webpage)
+
             mobj = re.match(self._VALID_URL, url)
 
         player_id = mobj.group('player_id')
@@ -482,7 +505,7 @@ class PBSIE(InfoExtractor):
             if player:
                 video_info = self._parse_json(
                     self._search_regex(
-                        r'(?s)PBS\.videoData\s*=\s*({.+?});\n',
+                        [r'(?s)PBS\.videoData\s*=\s*({.+?});\n', r'window\.videoBridge\s*=\s*({.+?});'],
                         player, '%s video data' % page, default='{}'),
                     display_id, transform_source=js_to_json, fatal=False)
                 if video_info:
@@ -490,10 +513,14 @@ class PBSIE(InfoExtractor):
                     if not info:
                         info = video_info
                 if not chapters:
-                    for chapter_data in re.findall(r'(?s)chapters\.push\(({.*?})\)', player):
-                        chapter = self._parse_json(chapter_data, video_id, js_to_json, fatal=False)
-                        if not chapter:
-                            continue
+                    raw_chapters = video_info.get('chapters') or []
+                    if not raw_chapters:
+                        for chapter_data in re.findall(r'(?s)chapters\.push\(({.*?})\)', player):
+                            chapter = self._parse_json(chapter_data, video_id, js_to_json, fatal=False)
+                            if not chapter:
+                                continue
+                            raw_chapters.append(chapter)
+                    for chapter in raw_chapters:
                         start_time = float_or_none(chapter.get('start_time'), 1000)
                         duration = float_or_none(chapter.get('duration'), 1000)
                         if start_time is None or duration is None: