From: Yen Chi Hsuan Date: Sun, 21 Aug 2016 16:48:35 +0000 (+0800) Subject: Merge branch 'charlierose' of https://github.com/TRox1972/youtube-dl into TRox1972... X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=commitdiff_plain;h=d13bfc07b79bafa0be6829a18d0d2e273e83339b;hp=db29af6d36b3d16614355dac70f22c4f2d8410d2;p=youtube-dl Merge branch 'charlierose' of https://github.com/TRox1972/youtube-dl into TRox1972-charlierose --- diff --git a/README.md b/README.md index 952db7abb..a10aaf35c 100644 --- a/README.md +++ b/README.md @@ -645,7 +645,11 @@ $ youtube-dl -f 'best[filesize<50M]' # Download best format available via direct link over HTTP/HTTPS protocol $ youtube-dl -f '(bestvideo+bestaudio/best)[protocol^=http]' + +# Download the best video format and the best audio format without merging them +$ youtube-dl -f 'bestvideo,bestaudio' -o '%(title)s.f%(format_id)s.%(ext)s' ``` +Note that in the last example, an output template is recommended as bestvideo and bestaudio may have the same file name. 
# VIDEO SELECTION diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index e844dc98a..0b3e3da82 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -1299,7 +1299,7 @@ class YoutubeDL(object): for subtitle_format in subtitle: if subtitle_format.get('url'): subtitle_format['url'] = sanitize_url(subtitle_format['url']) - if 'ext' not in subtitle_format: + if subtitle_format.get('ext') is None: subtitle_format['ext'] = determine_ext(subtitle_format['url']).lower() if self.params.get('listsubtitles', False): @@ -1354,7 +1354,7 @@ class YoutubeDL(object): note=' ({0})'.format(format['format_note']) if format.get('format_note') is not None else '', ) # Automatically determine file extension if missing - if 'ext' not in format: + if format.get('ext') is None: format['ext'] = determine_ext(format['url']).lower() # Automatically determine protocol if missing (useful for format # selection purposes) diff --git a/youtube_dl/extractor/cbs.py b/youtube_dl/extractor/cbs.py index a23173d6f..c72ed2dbb 100644 --- a/youtube_dl/extractor/cbs.py +++ b/youtube_dl/extractor/cbs.py @@ -4,6 +4,7 @@ from .theplatform import ThePlatformFeedIE from ..utils import ( int_or_none, find_xpath_attr, + ExtractorError, ) @@ -17,19 +18,6 @@ class CBSBaseIE(ThePlatformFeedIE): }] } if closed_caption_e is not None and closed_caption_e.attrib.get('value') else [] - def _extract_video_info(self, filter_query, video_id): - return self._extract_feed_info( - 'dJ5BDC', 'VxxJg8Ymh8sE', filter_query, video_id, lambda entry: { - 'series': entry.get('cbs$SeriesTitle'), - 'season_number': int_or_none(entry.get('cbs$SeasonNumber')), - 'episode': entry.get('cbs$EpisodeTitle'), - 'episode_number': int_or_none(entry.get('cbs$EpisodeNumber')), - }, { - 'StreamPack': { - 'manifest': 'm3u', - } - }) - class CBSIE(CBSBaseIE): _VALID_URL = r'(?:cbs:|https?://(?:www\.)?(?:cbs\.com/shows/[^/]+/video|colbertlateshow\.com/(?:video|podcasts))/)(?P<id>[\w-]+)' @@ -38,7 +26,6 @@ class CBSIE(CBSBaseIE): 
'url': 'http://www.cbs.com/shows/garth-brooks/video/_u7W953k6la293J7EPTd9oHkSPs6Xn6_/connect-chat-feat-garth-brooks/', 'info_dict': { 'id': '_u7W953k6la293J7EPTd9oHkSPs6Xn6_', - 'display_id': 'connect-chat-feat-garth-brooks', 'ext': 'mp4', 'title': 'Connect Chat feat. Garth Brooks', 'description': 'Connect with country music singer Garth Brooks, as he chats with fans on Wednesday November 27, 2013. Be sure to tune in to Garth Brooks: Live from Las Vegas, Friday November 29, at 9/8c on CBS!', @@ -47,7 +34,10 @@ class CBSIE(CBSBaseIE): 'upload_date': '20131127', 'uploader': 'CBSI-NEW', }, - 'expected_warnings': ['Failed to download m3u8 information'], + 'params': { + # m3u8 download + 'skip_download': True, + }, '_skip': 'Blocked outside the US', }, { 'url': 'http://colbertlateshow.com/video/8GmB0oY0McANFvp2aEffk9jZZZ2YyXxy/the-colbeard/', @@ -56,8 +46,31 @@ class CBSIE(CBSBaseIE): 'url': 'http://www.colbertlateshow.com/podcasts/dYSwjqPs_X1tvbV_P2FcPWRa_qT6akTC/in-the-bad-room-with-stephen/', 'only_matching': True, }] - TP_RELEASE_URL_TEMPLATE = 'http://link.theplatform.com/s/dJ5BDC/%s?mbr=true' + + def _extract_video_info(self, guid): + path = 'dJ5BDC/media/guid/2198311517/' + guid + smil_url = 'http://link.theplatform.com/s/%s?mbr=true' % path + formats, subtitles = self._extract_theplatform_smil(smil_url + '&manifest=m3u', guid) + for r in ('HLS&formats=M3U', 'RTMP', 'WIFI', '3G'): + try: + tp_formats, _ = self._extract_theplatform_smil(smil_url + '&assetTypes=' + r, guid, 'Downloading %s SMIL data' % r.split('&')[0]) + formats.extend(tp_formats) + except ExtractorError: + continue + self._sort_formats(formats) + metadata = self._download_theplatform_metadata(path, guid) + info = self._parse_theplatform_metadata(metadata) + info.update({ + 'id': guid, + 'formats': formats, + 'subtitles': subtitles, + 'series': metadata.get('cbs$SeriesTitle'), + 'season_number': int_or_none(metadata.get('cbs$SeasonNumber')), + 'episode': metadata.get('cbs$EpisodeTitle'), + 
'episode_number': int_or_none(metadata.get('cbs$EpisodeNumber')), + }) + return info def _real_extract(self, url): content_id = self._match_id(url) - return self._extract_video_info('byGuid=%s' % content_id, content_id) + return self._extract_video_info(content_id) diff --git a/youtube_dl/extractor/cbsnews.py b/youtube_dl/extractor/cbsnews.py index 9d3b75526..4aa6917a0 100644 --- a/youtube_dl/extractor/cbsnews.py +++ b/youtube_dl/extractor/cbsnews.py @@ -2,13 +2,13 @@ from __future__ import unicode_literals from .common import InfoExtractor -from .cbs import CBSBaseIE +from .cbs import CBSIE from ..utils import ( parse_duration, ) -class CBSNewsIE(CBSBaseIE): +class CBSNewsIE(CBSIE): IE_DESC = 'CBS News' _VALID_URL = r'https?://(?:www\.)?cbsnews\.com/(?:news|videos)/(?P<id>[\da-z_-]+)' @@ -35,7 +35,8 @@ class CBSNewsIE(CBSBaseIE): 'ext': 'mp4', 'title': 'Fort Hood shooting: Army downplays mental illness as cause of attack', 'description': 'md5:4a6983e480542d8b333a947bfc64ddc7', - 'upload_date': '19700101', + 'upload_date': '20140404', + 'timestamp': 1396650660, 'uploader': 'CBSI-NEW', 'thumbnail': 're:^https?://.*\.jpg$', 'duration': 205, @@ -63,7 +64,7 @@ class CBSNewsIE(CBSBaseIE): item = video_info['item'] if 'item' in video_info else video_info guid = item['mpxRefId'] - return self._extract_video_info('byGuid=%s' % guid, guid) + return self._extract_video_info(guid) class CBSNewsLiveVideoIE(InfoExtractor): diff --git a/youtube_dl/extractor/cbssports.py b/youtube_dl/extractor/cbssports.py index 78ca44b02..bf7915626 100644 --- a/youtube_dl/extractor/cbssports.py +++ b/youtube_dl/extractor/cbssports.py @@ -23,6 +23,9 @@ class CBSSportsIE(CBSBaseIE): } }] + def _extract_video_info(self, filter_query, video_id): + return self._extract_feed_info('dJ5BDC', 'VxxJg8Ymh8sE', filter_query, video_id) + def _real_extract(self, url): video_id = self._match_id(url) return self._extract_video_info('byId=%s' % video_id, video_id) diff --git a/youtube_dl/extractor/cnn.py 
b/youtube_dl/extractor/cnn.py index 53489a14e..220bb55e8 100644 --- a/youtube_dl/extractor/cnn.py +++ b/youtube_dl/extractor/cnn.py @@ -11,7 +11,7 @@ from ..utils import ( class CNNIE(InfoExtractor): - _VALID_URL = r'''(?x)https?://(?:(?:edition|www)\.)?cnn\.com/video/(?:data/.+?|\?)/ + _VALID_URL = r'''(?x)https?://(?:(?P<sub_domain>edition|www|money)\.)?cnn\.com/(?:video/(?:data/.+?|\?)/)?videos?/ (?P<path>.+?/(?P<title>[^/]+?)(?:\.(?:[a-z\-]+)|(?=&)))''' _TESTS = [{ @@ -45,19 +45,46 @@ class CNNIE(InfoExtractor): 'description': 'md5:e7223a503315c9f150acac52e76de086', 'upload_date': '20141222', } + }, { + 'url': 'http://money.cnn.com/video/news/2016/08/19/netflix-stunning-stats.cnnmoney/index.html', + 'md5': '52a515dc1b0f001cd82e4ceda32be9d1', + 'info_dict': { + 'id': '/video/news/2016/08/19/netflix-stunning-stats.cnnmoney', + 'ext': 'mp4', + 'title': '5 stunning stats about Netflix', + 'description': 'Did you know that Netflix has more than 80 million members? Here are five facts about the online video distributor that you probably didn\'t know.', + 'upload_date': '20160819', + } }, { 'url': 'http://cnn.com/video/?/video/politics/2015/03/27/pkg-arizona-senator-church-attendance-mandatory.ktvk', 'only_matching': True, }, { 'url': 'http://cnn.com/video/?/video/us/2015/04/06/dnt-baker-refuses-anti-gay-order.wkmg', 'only_matching': True, + }, { + 'url': 'http://edition.cnn.com/videos/arts/2016/04/21/olympic-games-cultural-a-z-brazil.cnn', + 'only_matching': True, }] + _CONFIG = { + # http://edition.cnn.com/.element/apps/cvp/3.0/cfg/spider/cnn/expansion/config.xml + 'edition': { + 'data_src': 'http://edition.cnn.com/video/data/3.0/video/%s/index.xml', + 'media_src': 'http://pmd.cdn.turner.com/cnn/big', + }, + # http://money.cnn.com/.element/apps/cvp2/cfg/config.xml + 'money': { + 'data_src': 'http://money.cnn.com/video/data/4.0/video/%s.xml', + 'media_src': 'http://ht3.cdn.turner.com/money/big', + }, + } + def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - path = 
mobj.group('path') - page_title = mobj.group('title') - info_url = 'http://edition.cnn.com/video/data/3.0/%s/index.xml' % path + sub_domain, path, page_title = re.match(self._VALID_URL, url).groups() + if sub_domain not in ('money', 'edition'): + sub_domain = 'edition' + config = self._CONFIG[sub_domain] + info_url = config['data_src'] % path info = self._download_xml(info_url, page_title) formats = [] @@ -66,7 +93,7 @@ class CNNIE(InfoExtractor): (?:_(?P<bitrate>[0-9]+)k)? ''') for f in info.findall('files/file'): - video_url = 'http://ht.cdn.turner.com/cnn/big%s' % (f.text.strip()) + video_url = config['media_src'] + f.text.strip() fdct = { 'format_id': f.attrib['bitrate'], 'url': video_url, @@ -146,7 +173,7 @@ class CNNBlogsIE(InfoExtractor): class CNNArticleIE(InfoExtractor): - _VALID_URL = r'https?://(?:(?:edition|www)\.)?cnn\.com/(?!video/)' + _VALID_URL = r'https?://(?:(?:edition|www)\.)?cnn\.com/(?!videos?/)' _TEST = { 'url': 'http://www.cnn.com/2014/12/21/politics/obama-north-koreas-hack-not-war-but-cyber-vandalism/', 'md5': '689034c2a3d9c6dc4aa72d65a81efd01', diff --git a/youtube_dl/extractor/cultureunplugged.py b/youtube_dl/extractor/cultureunplugged.py index 9c764fe68..9f26fa587 100644 --- a/youtube_dl/extractor/cultureunplugged.py +++ b/youtube_dl/extractor/cultureunplugged.py @@ -1,9 +1,13 @@ from __future__ import unicode_literals import re +import time from .common import InfoExtractor -from ..utils import int_or_none +from ..utils import ( + int_or_none, + HEADRequest, +) class CultureUnpluggedIE(InfoExtractor): @@ -32,6 +36,9 @@ class CultureUnpluggedIE(InfoExtractor): video_id = mobj.group('id') display_id = mobj.group('display_id') or video_id + # request setClientTimezone.php to get PHPSESSID cookie which is need to get valid json data in the next request + self._request_webpage(HEADRequest( + 'http://www.cultureunplugged.com/setClientTimezone.php?timeOffset=%d' % -(time.timezone / 3600)), display_id) movie_data = self._download_json( 
'http://www.cultureunplugged.com/movie-data/cu-%s.json' % video_id, display_id) diff --git a/youtube_dl/extractor/dotsub.py b/youtube_dl/extractor/dotsub.py index e9ca236d4..fd64d1a7f 100644 --- a/youtube_dl/extractor/dotsub.py +++ b/youtube_dl/extractor/dotsub.py @@ -10,18 +10,18 @@ from ..utils import ( class DotsubIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?dotsub\.com/view/(?P<id>[^/]+)' _TEST = { - 'url': 'http://dotsub.com/view/aed3b8b2-1889-4df5-ae63-ad85f5572f27', - 'md5': '0914d4d69605090f623b7ac329fea66e', + 'url': 'https://dotsub.com/view/9c63db2a-fa95-4838-8e6e-13deafe47f09', + 'md5': '21c7ff600f545358134fea762a6d42b6', 'info_dict': { - 'id': 'aed3b8b2-1889-4df5-ae63-ad85f5572f27', + 'id': '9c63db2a-fa95-4838-8e6e-13deafe47f09', 'ext': 'flv', - 'title': 'Pyramids of Waste (2010), AKA The Lightbulb Conspiracy - Planned obsolescence documentary', - 'description': 'md5:699a0f7f50aeec6042cb3b1db2d0d074', - 'thumbnail': 're:^https?://dotsub.com/media/aed3b8b2-1889-4df5-ae63-ad85f5572f27/p', - 'duration': 3169, - 'uploader': '4v4l0n42', - 'timestamp': 1292248482.625, - 'upload_date': '20101213', + 'title': 'MOTIVATION - "It\'s Possible" Best Inspirational Video Ever', + 'description': 'md5:41af1e273edbbdfe4e216a78b9d34ac6', + 'thumbnail': 're:^https?://dotsub.com/media/9c63db2a-fa95-4838-8e6e-13deafe47f09/p', + 'duration': 198, + 'uploader': 'liuxt', + 'timestamp': 1385778501.104, + 'upload_date': '20131130', 'view_count': int, } } diff --git a/youtube_dl/extractor/imdb.py b/youtube_dl/extractor/imdb.py index 0acce9f4c..3a6a6f5ad 100644 --- a/youtube_dl/extractor/imdb.py +++ b/youtube_dl/extractor/imdb.py @@ -6,6 +6,7 @@ from .common import InfoExtractor from ..utils import ( mimetype2ext, qualities, + remove_end, ) @@ -19,7 +20,7 @@ class ImdbIE(InfoExtractor): 'info_dict': { 'id': '2524815897', 'ext': 'mp4', - 'title': 'Ice Age: Continental Drift Trailer (No. 2) - IMDb', + 'title': 'Ice Age: Continental Drift Trailer (No. 
2)', 'description': 'md5:9061c2219254e5d14e03c25c98e96a81', } }, { @@ -83,10 +84,10 @@ class ImdbIE(InfoExtractor): return { 'id': video_id, - 'title': self._og_search_title(webpage), + 'title': remove_end(self._og_search_title(webpage), ' - IMDb'), 'formats': formats, 'description': descr, - 'thumbnail': format_info['slate'], + 'thumbnail': format_info.get('slate'), } diff --git a/youtube_dl/extractor/kaltura.py b/youtube_dl/extractor/kaltura.py index ddf1165ff..e0f7366c2 100644 --- a/youtube_dl/extractor/kaltura.py +++ b/youtube_dl/extractor/kaltura.py @@ -67,6 +67,27 @@ class KalturaIE(InfoExtractor): # video with subtitles 'url': 'kaltura:111032:1_cw786r8q', 'only_matching': True, + }, + { + # video with ttml subtitles (no fileExt) + 'url': 'kaltura:1926081:0_l5ye1133', + 'info_dict': { + 'id': '0_l5ye1133', + 'ext': 'mp4', + 'title': 'What Can You Do With Python?', + 'upload_date': '20160221', + 'uploader_id': 'stork', + 'thumbnail': 're:^https?://.*/thumbnail/.*', + 'timestamp': int, + 'subtitles': { + 'en': [{ + 'ext': 'ttml', + }], + }, + }, + 'params': { + 'skip_download': True, + }, } ] @@ -122,18 +143,6 @@ class KalturaIE(InfoExtractor): return data - def _get_kaltura_signature(self, video_id, partner_id, service_url=None): - actions = [{ - 'apiVersion': '3.1', - 'expiry': 86400, - 'format': 1, - 'service': 'session', - 'action': 'startWidgetSession', - 'widgetId': '_%s' % partner_id, - }] - return self._kaltura_api_call( - video_id, actions, service_url, note='Downloading Kaltura signature')['ks'] - def _get_video_info(self, video_id, partner_id, service_url=None): actions = [ { @@ -208,6 +217,17 @@ class KalturaIE(InfoExtractor): reference_id)['entryResult'] info, flavor_assets = entry_data['meta'], entry_data['contextData']['flavorAssets'] entry_id = info['id'] + # Unfortunately, data returned in kalturaIframePackageData lacks + # captions so we will try requesting the complete data using + # regular approach since we now know the entry_id + try: + _, 
info, flavor_assets, captions = self._get_video_info( + entry_id, partner_id) + except ExtractorError: + # Regular scenario failed but we already have everything + # extracted apart from captions and can process at least + # with this + pass else: raise ExtractorError('Invalid URL', expected=True) ks = params.get('flashvars[ks]', [None])[0] @@ -267,7 +287,7 @@ class KalturaIE(InfoExtractor): continue subtitles.setdefault(caption.get('languageCode') or caption.get('language'), []).append({ 'url': '%s/api_v3/service/caption_captionasset/action/serve/captionAssetId/%s' % (self._SERVICE_URL, caption['id']), - 'ext': caption.get('fileExt'), + 'ext': caption.get('fileExt', 'ttml'), }) return {