X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Fextractor%2Fpbs.py;h=16cc667d025514f64a66852d7e4ad0e8952297b8;hb=9cafc3fd8b54b9b91a145cddf9e4db0bd59e1b5f;hp=f1c0cd068dc12aed21023647fef5b8c83374567b;hpb=5aaf012a4eacc50bb5b131f6c26027e391fc379a;p=youtube-dl diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py index f1c0cd068..16cc667d0 100644 --- a/youtube_dl/extractor/pbs.py +++ b/youtube_dl/extractor/pbs.py @@ -8,7 +8,9 @@ from ..utils import ( ExtractorError, determine_ext, int_or_none, + float_or_none, js_to_json, + orderedSet, strip_jsonp, strip_or_none, unified_strdate, @@ -193,6 +195,8 @@ class PBSIE(InfoExtractor): ) ''' % '|'.join(list(zip(*_STATIONS))[0]) + _GEO_COUNTRIES = ['US'] + _TESTS = [ { 'url': 'http://www.pbs.org/tpt/constitution-usa-peter-sagal/watch/a-more-perfect-union/', @@ -236,7 +240,7 @@ class PBSIE(InfoExtractor): 'title': 'Great Performances - Dudamel Conducts Verdi Requiem at the Hollywood Bowl - Full', 'description': 'md5:657897370e09e2bc6bf0f8d2cd313c6b', 'duration': 6559, - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', }, }, { @@ -249,7 +253,7 @@ class PBSIE(InfoExtractor): 'description': 'md5:c741d14e979fc53228c575894094f157', 'title': 'NOVA - Killer Typhoon', 'duration': 3172, - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'upload_date': '20140122', 'age_limit': 10, }, @@ -261,6 +265,13 @@ class PBSIE(InfoExtractor): }, 'playlist_count': 2, }, + { + 'url': 'http://www.pbs.org/wgbh/americanexperience/films/great-war/', + 'info_dict': { + 'id': 'great-war', + }, + 'playlist_count': 3, + }, { 'url': 'http://www.pbs.org/wgbh/americanexperience/films/death/player/', 'info_dict': { @@ -270,7 +281,7 @@ class PBSIE(InfoExtractor): 'title': 'American Experience - Death and the Civil War, Chapter 1', 'description': 'md5:67fa89a9402e2ee7d08f53b920674c18', 'duration': 682, - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', }, 'params': { 'skip_download': True, # requires ffmpeg @@ -286,7 +297,7 @@ class PBSIE(InfoExtractor): 'title': 'FRONTLINE - United States of Secrets (Part One)', 'description': 'md5:55756bd5c551519cc4b7703e373e217e', 'duration': 6851, - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', }, }, { @@ -302,7 +313,7 @@ class PBSIE(InfoExtractor): 'title': "A Chef's Life - Season 3, Ep. 5: Prickly Business", 'description': 'md5:c0ff7475a4b70261c7e58f493c2792a5', 'duration': 1480, - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', }, }, { @@ -315,7 +326,7 @@ class PBSIE(InfoExtractor): 'title': 'FRONTLINE - The Atomic Artists', 'description': 'md5:f677e4520cfacb4a5ce1471e31b57800', 'duration': 723, - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', }, 'params': { 'skip_download': True, # requires ffmpeg @@ -330,7 +341,7 @@ class PBSIE(InfoExtractor): 'ext': 'mp4', 'title': 'FRONTLINE - Netanyahu at War', 'duration': 6852, - 'thumbnail': 're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg$', 'formats': 'mincount:8', }, }, @@ -379,10 +390,10 @@ class PBSIE(InfoExtractor): # tabbed frontline videos MULTI_PART_REGEXES = ( r']+class="videotab[^"]*"[^>]+vid="(\d+)"', - r']+href=["\']#video-\d+["\'][^>]+data-coveid=["\'](\d+)', + r']+href=["\']#(?:video-|part)\d+["\'][^>]+data-cove[Ii]d=["\'](\d+)', ) for p in MULTI_PART_REGEXES: - tabbed_videos = re.findall(p, webpage) + tabbed_videos = orderedSet(re.findall(p, webpage)) if tabbed_videos: return tabbed_videos, presumptive_id, upload_date, description @@ -462,6 +473,7 @@ class PBSIE(InfoExtractor): redirects.append(redirect) redirect_urls.add(redirect_url) + chapters = [] # Player pages may also serve different qualities for page in ('widget/partnerplayer', 'portalplayer'): player = self._download_webpage( @@ -477,6 +489,20 @@ class PBSIE(InfoExtractor): extract_redirect_urls(video_info) if not info: info = video_info + if not chapters: + for chapter_data in re.findall(r'(?s)chapters\.push\(({.*?})\)', player): + chapter = self._parse_json(chapter_data, video_id, js_to_json, fatal=False) + if not chapter: + continue + start_time = float_or_none(chapter.get('start_time'), 1000) + duration = float_or_none(chapter.get('duration'), 1000) + if start_time is None or duration is None: + continue + chapters.append({ + 'start_time': start_time, + 'end_time': start_time + duration, + 'title': chapter.get('title'), + }) formats = [] http_url = None @@ -489,11 +515,13 @@ class PBSIE(InfoExtractor): headers=self.geo_verification_headers()) if redirect_info['status'] == 'error': + message = self._ERRORS.get( + redirect_info['http_code'], redirect_info['message']) + if redirect_info['http_code'] == 403: + self.raise_geo_restricted( + msg=message, countries=self._GEO_COUNTRIES) raise ExtractorError( - '%s said: %s' % ( - self.IE_NAME, - self._ERRORS.get(redirect_info['http_code'], redirect_info['message'])), - expected=True) + '%s said: %s' % (self.IE_NAME, message), expected=True) format_url = redirect_info.get('url') if not format_url: @@ -511,7 +539,7 @@ class PBSIE(InfoExtractor): http_url = format_url self._remove_duplicate_formats(formats) m3u8_formats = list(filter( - lambda f: f.get('protocol') == 'm3u8' and f.get('vcodec') != 'none' and f.get('resolution') != 'multiple', + lambda f: f.get('protocol') == 'm3u8' and f.get('vcodec') != 'none', formats)) if http_url: for m3u8_format in m3u8_formats: @@ -568,7 +596,7 @@ class PBSIE(InfoExtractor): # Try turning it to 'program - title' naming scheme if possible alt_title = info.get('program', {}).get('title') if alt_title: - info['title'] = alt_title + ' - ' + re.sub(r'^' + alt_title + '[\s\-:]+', '', info['title']) + info['title'] = alt_title + ' - ' + re.sub(r'^' + alt_title + r'[\s\-:]+', '', info['title']) description = info.get('description') or info.get( 'program', {}).get('description') or description @@ -584,4 +612,5 @@ class PBSIE(InfoExtractor): 'upload_date': upload_date, 'formats': formats, 'subtitles': subtitles, + 'chapters': chapters, }