projects
/
youtube-dl
/ commitdiff
commit
grep
author
committer
pickaxe
?
search:
re
summary
|
shortlog
|
log
|
commit
| commitdiff |
tree
raw
|
patch
|
inline
| side by side (parent:
82997da
)
[pbs] Fix description extraction and update tests
author
Sergey M․
<dstftw@gmail.com>
Sat, 13 Aug 2016 14:59:29 +0000
(21:59 +0700)
committer
Sergey M․
<dstftw@gmail.com>
Sat, 13 Aug 2016 14:59:29 +0000
(21:59 +0700)
youtube_dl/extractor/pbs.py
patch
|
blob
|
history
diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py
index 335e44bdc7bfa5d9d86de38a200a890ce14e1040..09aef7fb9f455d8d3247350b26a6d66043877605 100644 (file)
--- a/youtube_dl/extractor/pbs.py
+++ b/youtube_dl/extractor/pbs.py
@@ -10,6 +10,7 @@ from ..utils import (
int_or_none,
js_to_json,
strip_jsonp,
int_or_none,
js_to_json,
strip_jsonp,
+ strip_or_none,
unified_strdate,
US_RATINGS,
)
unified_strdate,
US_RATINGS,
)
@@ -200,7 +201,7 @@ class PBSIE(InfoExtractor):
'id': '2365006249',
'ext': 'mp4',
'title': 'Constitution USA with Peter Sagal - A More Perfect Union',
'id': '2365006249',
'ext': 'mp4',
'title': 'Constitution USA with Peter Sagal - A More Perfect Union',
- 'description': 'md5:36f341ae62e251b8f5bd2b754b95a071',
+ 'description': 'md5:31b664af3c65fd07fa460d306b837d00',
'duration': 3190,
},
},
'duration': 3190,
},
},
@@ -211,7 +212,7 @@ class PBSIE(InfoExtractor):
'id': '2365297690',
'ext': 'mp4',
'title': 'FRONTLINE - Losing Iraq',
'id': '2365297690',
'ext': 'mp4',
'title': 'FRONTLINE - Losing Iraq',
- 'description': 'md5:4d3eaa01f94e61b3e73704735f1196d9',
+ 'description': 'md5:5979a4d069b157f622d02bff62fbe654',
'duration': 5050,
},
},
'duration': 5050,
},
},
@@ -222,7 +223,7 @@ class PBSIE(InfoExtractor):
'id': '2201174722',
'ext': 'mp4',
'title': 'PBS NewsHour - Cyber Schools Gain Popularity, but Quality Questions Persist',
'id': '2201174722',
'ext': 'mp4',
'title': 'PBS NewsHour - Cyber Schools Gain Popularity, but Quality Questions Persist',
- 'description': 'md5:95a19f568689d09a166dff9edada3301',
+ 'description': 'md5:86ab9a3d04458b876147b355788b8781',
'duration': 801,
},
},
'duration': 801,
},
},
@@ -267,7 +268,7 @@ class PBSIE(InfoExtractor):
'display_id': 'player',
'ext': 'mp4',
'title': 'American Experience - Death and the Civil War, Chapter 1',
'display_id': 'player',
'ext': 'mp4',
'title': 'American Experience - Death and the Civil War, Chapter 1',
- 'description': 'md5:1b80a74e0380ed2a4fb335026de1600d',
+ 'description': 'md5:67fa89a9402e2ee7d08f53b920674c18',
'duration': 682,
'thumbnail': 're:^https?://.*\.jpg$',
},
'duration': 682,
'thumbnail': 're:^https?://.*\.jpg$',
},
@@ -293,13 +294,13 @@ class PBSIE(InfoExtractor):
# "<iframe style='position: absolute;<br />\ntop: 0; left: 0;' ...", see
# https://github.com/rg3/youtube-dl/issues/7059)
'url': 'http://www.pbs.org/food/features/a-chefs-life-season-3-episode-5-prickly-business/',
# "<iframe style='position: absolute;<br />\ntop: 0; left: 0;' ...", see
# https://github.com/rg3/youtube-dl/issues/7059)
'url': 'http://www.pbs.org/food/features/a-chefs-life-season-3-episode-5-prickly-business/',
- 'md5': '84ced42850d78f1d4650297356e95e6f',
+ 'md5': '59b0ef5009f9ac8a319cc5efebcd865e',
'info_dict': {
'id': '2365546844',
'display_id': 'a-chefs-life-season-3-episode-5-prickly-business',
'ext': 'mp4',
'title': "A Chef's Life - Season 3, Ep. 5: Prickly Business",
'info_dict': {
'id': '2365546844',
'display_id': 'a-chefs-life-season-3-episode-5-prickly-business',
'ext': 'mp4',
'title': "A Chef's Life - Season 3, Ep. 5: Prickly Business",
- 'description': 'md5:54033c6baa1f9623607c6e2ed245888b',
+ 'description': 'md5:c0ff7475a4b70261c7e58f493c2792a5',
'duration': 1480,
'thumbnail': 're:^https?://.*\.jpg$',
},
'duration': 1480,
'thumbnail': 're:^https?://.*\.jpg$',
},
@@ -312,7 +313,7 @@ class PBSIE(InfoExtractor):
'display_id': 'the-atomic-artists',
'ext': 'mp4',
'title': 'FRONTLINE - The Atomic Artists',
'display_id': 'the-atomic-artists',
'ext': 'mp4',
'title': 'FRONTLINE - The Atomic Artists',
- 'description': 'md5:1a2481e86b32b2e12ec1905dd473e2c1',
+ 'description': 'md5:f677e4520cfacb4a5ce1471e31b57800',
'duration': 723,
'thumbnail': 're:^https?://.*\.jpg$',
},
'duration': 723,
'thumbnail': 're:^https?://.*\.jpg$',
},
@@ -323,7 +324,7 @@ class PBSIE(InfoExtractor):
{
# Serves hd only via wigget/partnerplayer page
'url': 'http://www.pbs.org/video/2365641075/',
{
# Serves hd only via wigget/partnerplayer page
'url': 'http://www.pbs.org/video/2365641075/',
- 'md5': 'acfd4c400b48149a44861cb16dd305cf',
+ 'md5': 'fdf907851eab57211dd589cf12006666',
'info_dict': {
'id': '2365641075',
'ext': 'mp4',
'info_dict': {
'id': '2365641075',
'ext': 'mp4',
@@ -333,16 +334,6 @@ class PBSIE(InfoExtractor):
'formats': 'mincount:8',
},
},
'formats': 'mincount:8',
},
},
- {
- # has undocumented http formats(4500k and 6500k)
- 'url': 'http://www.pbs.org/video/2365815229/',
- 'md5': '94635cd06b7133688e23f4b94e6637a5',
- 'info_dict': {
- 'id': '2365815229',
- 'ext': 'mp4',
- 'title': 'FRONTLINE - Mosquito Hunter',
- },
- },
{
'url': 'http://player.pbs.org/widget/partnerplayer/2365297708/?start=0&end=0&chapterbar=false&endscreen=false&topbar=true',
'only_matching': True,
{
'url': 'http://player.pbs.org/widget/partnerplayer/2365297708/?start=0&end=0&chapterbar=false&endscreen=false&topbar=true',
'only_matching': True,
@@ -362,11 +353,16 @@ class PBSIE(InfoExtractor):
def _extract_webpage(self, url):
mobj = re.match(self._VALID_URL, url)
def _extract_webpage(self, url):
mobj = re.match(self._VALID_URL, url)
+ description = None
+
presumptive_id = mobj.group('presumptive_id')
display_id = presumptive_id
if presumptive_id:
webpage = self._download_webpage(url, display_id)
presumptive_id = mobj.group('presumptive_id')
display_id = presumptive_id
if presumptive_id:
webpage = self._download_webpage(url, display_id)
+ description = strip_or_none(self._og_search_description(
+ webpage, default=None) or self._html_search_meta(
+ 'description', webpage, default=None))
upload_date = unified_strdate(self._search_regex(
r'<input type="hidden" id="air_date_[0-9]+" value="([^"]+)"',
webpage, 'upload date', default=None))
upload_date = unified_strdate(self._search_regex(
r'<input type="hidden" id="air_date_[0-9]+" value="([^"]+)"',
webpage, 'upload date', default=None))
@@ -379,7 +375,7 @@ class PBSIE(InfoExtractor):
for p in MULTI_PART_REGEXES:
tabbed_videos = re.findall(p, webpage)
if tabbed_videos:
for p in MULTI_PART_REGEXES:
tabbed_videos = re.findall(p, webpage)
if tabbed_videos:
- return tabbed_videos, presumptive_id, upload_date
+ return tabbed_videos, presumptive_id, upload_date, description
MEDIA_ID_REGEXES = [
r"div\s*:\s*'videoembed'\s*,\s*mediaid\s*:\s*'(\d+)'", # frontline video embed
MEDIA_ID_REGEXES = [
r"div\s*:\s*'videoembed'\s*,\s*mediaid\s*:\s*'(\d+)'", # frontline video embed
@@ -391,7 +387,7 @@ class PBSIE(InfoExtractor):
media_id = self._search_regex(
MEDIA_ID_REGEXES, webpage, 'media ID', fatal=False, default=None)
if media_id:
media_id = self._search_regex(
MEDIA_ID_REGEXES, webpage, 'media ID', fatal=False, default=None)
if media_id:
- return media_id, presumptive_id, upload_date
+ return media_id, presumptive_id, upload_date, description
# Fronline video embedded via flp
video_id = self._search_regex(
# Fronline video embedded via flp
video_id = self._search_regex(
@@ -408,7 +404,7 @@ class PBSIE(InfoExtractor):
'http://www.pbs.org/wgbh/pages/frontline/.json/getdir/getdir%d.json' % prg_id,
presumptive_id, 'Downloading getdir JSON',
transform_source=strip_jsonp)
'http://www.pbs.org/wgbh/pages/frontline/.json/getdir/getdir%d.json' % prg_id,
presumptive_id, 'Downloading getdir JSON',
transform_source=strip_jsonp)
- return getdir['mid'], presumptive_id, upload_date
+ return getdir['mid'], presumptive_id, upload_date, description
for iframe in re.findall(r'(?s)<iframe(.+?)></iframe>', webpage):
url = self._search_regex(
for iframe in re.findall(r'(?s)<iframe(.+?)></iframe>', webpage):
url = self._search_regex(
@@ -432,10 +428,10 @@ class PBSIE(InfoExtractor):
video_id = mobj.group('id')
display_id = video_id
video_id = mobj.group('id')
display_id = video_id
- return video_id, display_id, None
+ return video_id, display_id, None, description
def _real_extract(self, url):
def _real_extract(self, url):
- video_id, display_id, upload_date = self._extract_webpage(url)
+ video_id, display_id, upload_date, description = self._extract_webpage(url)
if isinstance(video_id, list):
entries = [self.url_result(
if isinstance(video_id, list):
entries = [self.url_result(
@@ -564,11 +560,14 @@ class PBSIE(InfoExtractor):
if alt_title:
info['title'] = alt_title + ' - ' + re.sub(r'^' + alt_title + '[\s\-:]+', '', info['title'])
if alt_title:
info['title'] = alt_title + ' - ' + re.sub(r'^' + alt_title + '[\s\-:]+', '', info['title'])
+ description = info.get('description') or info.get(
+ 'program', {}).get('description') or description
+
return {
'id': video_id,
'display_id': display_id,
'title': info['title'],
return {
'id': video_id,
'display_id': display_id,
'title': info['title'],
- 'description': info.get('description') or info.get('program', {}).get('description'),
+ 'description': description,
'thumbnail': info.get('image_url'),
'duration': int_or_none(info.get('duration')),
'age_limit': age_limit,
'thumbnail': info.get('image_url'),
'duration': int_or_none(info.get('duration')),
'age_limit': age_limit,