From: remitamine Date: Sat, 6 Feb 2016 05:26:02 +0000 (+0100) Subject: Merge pull request #8408 from remitamine/dash X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=commitdiff_plain;h=66159b38aad38d55f84a358a0c2ed2add9a2946d;hp=255732f0d33268aeababb1b3ce37a1defb5bc965;p=youtube-dl Merge pull request #8408 from remitamine/dash Add generic support for mpd manifests(dash formats) --- diff --git a/README.md b/README.md index 7c582511f..79cd08df4 100644 --- a/README.md +++ b/README.md @@ -455,6 +455,8 @@ The `-o` option allows users to indicate a template for the output file names. T - `format_id`: The sequence will be replaced by the format code specified by `--format`. - `duration`: The sequence will be replaced by the length of the video in seconds. +Note that some of the aforementioned sequences are not guaranteed to be present since they depend on the metadata obtained by particular extractor, such sequences will be replaced with `NA`. + The current default template is `%(title)s-%(id)s.%(ext)s`. In some cases, you don't want special characters such as 中, spaces, or &, such as when transferring the downloaded filename to a Windows system or the filename through an 8bit-unsafe channel. In these cases, add the `--restrict-filenames` flag to get a shorter title: diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 61be9990d..ee34adf26 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -91,6 +91,7 @@ - **Canvas** - **CBS** - **CBSNews**: CBS News + - **CBSNewsLiveVideo**: CBS News Live Videos - **CBSSports** - **CeskaTelevize** - **channel9**: Channel 9 diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index 73910eaec..88c63010e 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -248,6 +248,17 @@ class TestFormatSelection(unittest.TestCase): def format_info(f_id): info = YoutubeIE._formats[f_id].copy() + + # XXX: In real cases InfoExtractor._parse_mpd() fills up 'acodec' + # and 'vcodec', while in tests such information is incomplete since + # commit a6c2c24479e5f4827ceb06f64d855329c0a6f593 + # test_YoutubeDL.test_youtube_format_selection is broken without + # this fix + if 'acodec' in info and 'vcodec' not in info: + info['vcodec'] = 'none' + elif 'vcodec' in info and 'acodec' not in info: + info['acodec'] = 'none' + info['format_id'] = f_id info['url'] = 'url:' + f_id return info diff --git a/test/test_subtitles.py b/test/test_subtitles.py index 9a695c4e8..27e763edd 100644 --- a/test/test_subtitles.py +++ b/test/test_subtitles.py @@ -65,16 +65,16 @@ class TestYoutubeSubtitles(BaseTestSubtitles): self.DL.params['allsubtitles'] = True subtitles = self.getSubtitles() self.assertEqual(len(subtitles.keys()), 13) - self.assertEqual(md5(subtitles['en']), '4cd9278a35ba2305f47354ee13472260') - self.assertEqual(md5(subtitles['it']), '164a51f16f260476a05b50fe4c2f161d') - for lang in ['it', 'fr', 'de']: + self.assertEqual(md5(subtitles['en']), '3cb210999d3e021bd6c7f0ea751eab06') + self.assertEqual(md5(subtitles['it']), '6d752b98c31f1cf8d597050c7a2cb4b5') + for lang in ['fr', 'de']: self.assertTrue(subtitles.get(lang) is not None, 'Subtitles for \'%s\' not extracted' % lang) - def test_youtube_subtitles_sbv_format(self): + def test_youtube_subtitles_ttml_format(self): self.DL.params['writesubtitles'] = True - self.DL.params['subtitlesformat'] = 'sbv' + self.DL.params['subtitlesformat'] = 'ttml' subtitles = self.getSubtitles() - self.assertEqual(md5(subtitles['en']), '13aeaa0c245a8bed9a451cb643e3ad8b') + self.assertEqual(md5(subtitles['en']), 'e306f8c42842f723447d9f63ad65df54') def test_youtube_subtitles_vtt_format(self): self.DL.params['writesubtitles'] = True diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index e61a88de7..2fbc7f812 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -90,7 +90,10 @@ from .canalplus import CanalplusIE from .canalc2 import Canalc2IE from .canvas import CanvasIE from .cbs import CBSIE -from .cbsnews import CBSNewsIE +from .cbsnews import ( + CBSNewsIE, + CBSNewsLiveVideoIE, +) from .cbssports import CBSSportsIE from .ccc import CCCIE from .ceskatelevize import CeskaTelevizeIE @@ -819,7 +822,11 @@ from .videomore import ( ) from .videopremium import VideoPremiumIE from .videott import VideoTtIE -from .vidme import VidmeIE +from .vidme import ( + VidmeIE, + VidmeUserIE, + VidmeUserLikesIE, +) from .vidzi import VidziIE from .vier import VierIE, VierVideosIE from .viewster import ViewsterIE diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py index b9e07f0ef..6ed855a57 100644 --- a/youtube_dl/extractor/arte.py +++ b/youtube_dl/extractor/arte.py @@ -13,6 +13,7 @@ from ..utils import ( unified_strdate, get_element_by_attribute, int_or_none, + NO_DEFAULT, qualities, ) @@ -93,9 +94,18 @@ class ArteTVPlus7IE(InfoExtractor): json_url = self._html_search_regex( patterns, webpage, 'json vp url', default=None) if not json_url: - iframe_url = self._html_search_regex( - r']+src=(["\'])(?P.+\bjson_url=.+?)\1', - webpage, 'iframe url', group='url') + def find_iframe_url(webpage, default=NO_DEFAULT): + return self._html_search_regex( + r']+src=(["\'])(?P.+\bjson_url=.+?)\1', + webpage, 'iframe url', group='url', default=default) + + iframe_url = find_iframe_url(webpage, None) + if not iframe_url: + embed_url = self._html_search_regex( + r'arte_vp_url_oembed=\'([^\']+?)\'', webpage, 'embed url') + player = self._download_json( + embed_url, video_id, 'Downloading player page') + iframe_url = find_iframe_url(player['html']) json_url = compat_parse_qs( compat_urllib_parse_urlparse(iframe_url).query)['json_url'][0] return self._extract_from_json_url(json_url, video_id, lang) diff --git a/youtube_dl/extractor/cbsnews.py b/youtube_dl/extractor/cbsnews.py index cabf7e73b..8f864699f 100644 --- a/youtube_dl/extractor/cbsnews.py +++ b/youtube_dl/extractor/cbsnews.py @@ -1,15 +1,14 @@ # encoding: utf-8 from __future__ import unicode_literals -import re -import json - +from .common import InfoExtractor from .theplatform import ThePlatformIE +from ..utils import parse_duration class CBSNewsIE(ThePlatformIE): IE_DESC = 'CBS News' - _VALID_URL = r'http://(?:www\.)?cbsnews\.com/(?:[^/]+/)+(?P[\da-z_-]+)' + _VALID_URL = r'http://(?:www\.)?cbsnews\.com/(?:news|videos)/(?P[\da-z_-]+)' _TESTS = [ { @@ -48,14 +47,13 @@ class CBSNewsIE(ThePlatformIE): ] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') + video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - video_info = json.loads(self._html_search_regex( + video_info = self._parse_json(self._html_search_regex( r'(?: