X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Fextractor%2Fpbs.py;h=a28ee17caa5ec19b245bf39cf43e3fa448bc936a;hb=f7560859a3e25ccaa74123428d42f821299a2bed;hp=6166dc2adf41a431c79244e01256ab94f0dcb704;hpb=ff99fe529e52b2465f1d973e69df01a6391568d6;p=youtube-dl diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py index 6166dc2ad..a28ee17ca 100644 --- a/youtube_dl/extractor/pbs.py +++ b/youtube_dl/extractor/pbs.py @@ -8,7 +8,9 @@ from ..utils import ( ExtractorError, determine_ext, int_or_none, + float_or_none, js_to_json, + orderedSet, strip_jsonp, strip_or_none, unified_strdate, @@ -185,9 +187,9 @@ class PBSIE(InfoExtractor): _VALID_URL = r'''(?x)https?:// (?: # Direct video URL - (?:%s)/(?:viralplayer|video)/(?P<id>[0-9]+)/? | + (?:%s)/(?:(?:vir|port)alplayer|video)/(?P<id>[0-9]+)(?:[?/]|$) | # Article with embedded player (or direct video) - (?:www\.)?pbs\.org/(?:[^/]+/){2,5}(?P<presumptive_id>[^/]+?)(?:\.html)?/?(?:$|[?\#]) | + (?:www\.)?pbs\.org/(?:[^/]+/){1,5}(?P<presumptive_id>[^/]+?)(?:\.html)?/?(?:$|[?\#]) | # Player (?:video|player)\.pbs\.org/(?:widget/)?partnerplayer/(?P<player_id>[^/]+)/ ) @@ -263,6 +265,13 @@ class PBSIE(InfoExtractor): }, 'playlist_count': 2, }, + { + 'url': 'http://www.pbs.org/wgbh/americanexperience/films/great-war/', + 'info_dict': { + 'id': 'great-war', + }, + 'playlist_count': 3, + }, { 'url': 'http://www.pbs.org/wgbh/americanexperience/films/death/player/', 'info_dict': { @@ -336,6 +345,21 @@ class PBSIE(InfoExtractor): 'formats': 'mincount:8', }, }, + { + # https://github.com/rg3/youtube-dl/issues/13801 + 'url': 'https://www.pbs.org/video/pbs-newshour-full-episode-july-31-2017-1501539057/', + 'info_dict': { + 'id': '3003333873', + 'ext': 'mp4', + 'title': 'PBS NewsHour - full episode July 31, 2017', + 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', + 'duration': 3265, + 'thumbnail': r're:^https?://.*\.jpg$', + }, + 'params': { + 'skip_download': True, + }, + }, { 'url': 
'http://player.pbs.org/widget/partnerplayer/2365297708/?start=0&end=0&chapterbar=false&endscreen=false&topbar=true', 'only_matching': True, @@ -343,6 +367,10 @@ class PBSIE(InfoExtractor): { 'url': 'http://watch.knpb.org/video/2365616055/', 'only_matching': True, + }, + { + 'url': 'https://player.pbs.org/portalplayer/3004638221/?uid=', + 'only_matching': True, } ] _ERRORS = { @@ -381,10 +409,10 @@ class PBSIE(InfoExtractor): # tabbed frontline videos MULTI_PART_REGEXES = ( r'<div[^>]+class="videotab[^"]*"[^>]+vid="(\d+)"', - r'<a[^>]+href=["\']#video-\d+["\'][^>]+data-coveid=["\'](\d+)', + r'<a[^>]+href=["\']#(?:video-|part)\d+["\'][^>]+data-cove[Ii]d=["\'](\d+)', ) for p in MULTI_PART_REGEXES: - tabbed_videos = re.findall(p, webpage) + tabbed_videos = orderedSet(re.findall(p, webpage)) if tabbed_videos: return tabbed_videos, presumptive_id, upload_date, description @@ -393,6 +421,7 @@ class PBSIE(InfoExtractor): r'class="coveplayerid">([^<]+)<', # coveplayer r'<section[^>]+data-coveid="(\d+)"', # coveplayer from http://www.pbs.org/wgbh/frontline/film/real-csi/ r'<input type="hidden" id="pbs_video_id_[0-9]+" value="([0-9]+)"/>', # jwplayer + r"(?s)window\.PBS\.playerConfig\s*=\s*{.*?id\s*:\s*'([0-9]+)',", ] media_id = self._search_regex( @@ -424,6 +453,9 @@ class PBSIE(InfoExtractor): if url: break + if not url: + url = self._og_search_url(webpage) + mobj = re.match(self._VALID_URL, url) player_id = mobj.group('player_id') @@ -464,6 +496,7 @@ class PBSIE(InfoExtractor): redirects.append(redirect) redirect_urls.add(redirect_url) + chapters = [] # Player pages may also serve different qualities for page in ('widget/partnerplayer', 'portalplayer'): player = self._download_webpage( @@ -472,13 +505,31 @@ class PBSIE(InfoExtractor): if player: video_info = self._parse_json( self._search_regex( - r'(?s)PBS\.videoData\s*=\s*({.+?});\n', + [r'(?s)PBS\.videoData\s*=\s*({.+?});\n', r'window\.videoBridge\s*=\s*({.+?});'], player, '%s video data' % page, default='{}'), display_id, transform_source=js_to_json, fatal=False) if video_info: 
extract_redirect_urls(video_info) if not info: info = video_info + if not chapters: + raw_chapters = video_info.get('chapters') or [] + if not raw_chapters: + for chapter_data in re.findall(r'(?s)chapters\.push\(({.*?})\)', player): + chapter = self._parse_json(chapter_data, video_id, js_to_json, fatal=False) + if not chapter: + continue + raw_chapters.append(chapter) + for chapter in raw_chapters: + start_time = float_or_none(chapter.get('start_time'), 1000) + duration = float_or_none(chapter.get('duration'), 1000) + if start_time is None or duration is None: + continue + chapters.append({ + 'start_time': start_time, + 'end_time': start_time + duration, + 'title': chapter.get('title'), + }) formats = [] http_url = None @@ -588,4 +639,5 @@ class PBSIE(InfoExtractor): 'upload_date': upload_date, 'formats': formats, 'subtitles': subtitles, + 'chapters': chapters, }