X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Fextractor%2Farte.py;h=efde7e207bc8d166e80f2a26429797684535d114;hb=d1e440a4a18522207a1a3e624bf801c8338f9146;hp=7ef42e0924b9e42829dbc8f3ae0e59e1671af23f;hpb=9c54ae3387b883bec5f014e2ac864a9a6c109163;p=youtube-dl diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py index 7ef42e092..efde7e207 100644 --- a/youtube_dl/extractor/arte.py +++ b/youtube_dl/extractor/arte.py @@ -13,6 +13,7 @@ from ..utils import ( unified_strdate, get_element_by_attribute, int_or_none, + NO_DEFAULT, qualities, ) @@ -22,7 +23,7 @@ from ..utils import ( class ArteTvIE(InfoExtractor): - _VALID_URL = r'http://videos\.arte\.tv/(?Pfr|de)/.*-(?P.*?)\.html' + _VALID_URL = r'http://videos\.arte\.tv/(?Pfr|de|en|es)/.*-(?P.*?)\.html' IE_NAME = 'arte.tv' def _real_extract(self, url): @@ -62,7 +63,7 @@ class ArteTvIE(InfoExtractor): class ArteTVPlus7IE(InfoExtractor): IE_NAME = 'arte.tv:+7' - _VALID_URL = r'https?://(?:www\.)?arte\.tv/guide/(?Pfr|de)/(?:(?:sendungen|emissions)/)?(?P.*?)/(?P.*?)(\?.*)?' 
+ _VALID_URL = r'https?://(?:www\.)?arte\.tv/guide/(?Pfr|de|en|es)/(?:(?:sendungen|emissions|embed)/)?(?P[^/]+)/(?P[^/?#&+])' @classmethod def _extract_url_info(cls, url): @@ -93,12 +94,40 @@ class ArteTVPlus7IE(InfoExtractor): json_url = self._html_search_regex( patterns, webpage, 'json vp url', default=None) if not json_url: - iframe_url = self._html_search_regex( - r']+src=(["\'])(?P.+\bjson_url=.+?)\1', - webpage, 'iframe url', group='url') - json_url = compat_parse_qs( - compat_urllib_parse_urlparse(iframe_url).query)['json_url'][0] - return self._extract_from_json_url(json_url, video_id, lang) + def find_iframe_url(webpage, default=NO_DEFAULT): + return self._html_search_regex( + r']+src=(["\'])(?P.+\bjson_url=.+?)\1', + webpage, 'iframe url', group='url', default=default) + + iframe_url = find_iframe_url(webpage, None) + if not iframe_url: + embed_url = self._html_search_regex( + r'arte_vp_url_oembed=\'([^\']+?)\'', webpage, 'embed url', default=None) + if embed_url: + player = self._download_json( + embed_url, video_id, 'Downloading player page') + iframe_url = find_iframe_url(player['html']) + # en and es URLs produce react-based pages with different layout (e.g. + # http://www.arte.tv/guide/en/053330-002-A/carnival-italy?zone=world) + if not iframe_url: + program = self._search_regex( + r'program\s*:\s*({.+?["\']embed_html["\'].+?}),?\s*\n', + webpage, 'program', default=None) + if program: + embed_html = self._parse_json(program, video_id) + if embed_html: + iframe_url = find_iframe_url(embed_html['embed_html']) + if iframe_url: + json_url = compat_parse_qs( + compat_urllib_parse_urlparse(iframe_url).query)['json_url'][0] + if json_url: + return self._extract_from_json_url(json_url, video_id, lang) + # Different kind of embed URL (e.g. 
+ # http://www.arte.tv/magazine/trepalium/fr/episode-0406-replay-trepalium) + embed_url = self._search_regex( + r']+src=(["\'])(?P.+?)\1', + webpage, 'embed url', group='url') + return self.url_result(embed_url) def _extract_from_json_url(self, json_url, video_id, lang): info = self._download_json(json_url, video_id) @@ -106,7 +135,7 @@ class ArteTVPlus7IE(InfoExtractor): upload_date_str = player_info.get('shootingDate') if not upload_date_str: - upload_date_str = player_info.get('VDA', '').split(' ')[0] + upload_date_str = (player_info.get('VRA') or player_info.get('VDA') or '').split(' ')[0] title = player_info['VTI'].strip() subtitle = player_info.get('VSU', '').strip() @@ -122,27 +151,30 @@ class ArteTVPlus7IE(InfoExtractor): } qfunc = qualities(['HQ', 'MQ', 'EQ', 'SQ']) + LANGS = { + 'fr': 'F', + 'de': 'A', + 'en': 'E[ANG]', + 'es': 'E[ESP]', + } + formats = [] for format_id, format_dict in player_info['VSR'].items(): f = dict(format_dict) versionCode = f.get('versionCode') - - langcode = { - 'fr': 'F', - 'de': 'A', - }.get(lang, lang) - lang_rexs = [r'VO?%s' % langcode, r'VO?.-ST%s' % langcode] - lang_pref = ( - None if versionCode is None else ( - 10 if any(re.match(r, versionCode) for r in lang_rexs) - else -10)) + langcode = LANGS.get(lang, lang) + lang_rexs = [r'VO?%s-' % re.escape(langcode), r'VO?.-ST%s$' % re.escape(langcode)] + lang_pref = None + if versionCode: + matched_lang_rexs = [r for r in lang_rexs if re.match(r, versionCode)] + lang_pref = -10 if not matched_lang_rexs else 10 * len(matched_lang_rexs) source_pref = 0 if versionCode is not None: # The original version with subtitles has lower relevance - if re.match(r'VO-ST(F|A)', versionCode): + if re.match(r'VO-ST(F|A|E)', versionCode): source_pref -= 10 # The version with sourds/mal subtitles has also lower relevance - elif re.match(r'VO?(F|A)-STM\1', versionCode): + elif re.match(r'VO?(F|A|E)-STM\1', versionCode): source_pref -= 9 format = { 'format_id': format_id, @@ -175,7 +207,7 @@ class 
ArteTVPlus7IE(InfoExtractor): # It also uses the arte_vp_url url from the webpage to extract the information class ArteTVCreativeIE(ArteTVPlus7IE): IE_NAME = 'arte.tv:creative' - _VALID_URL = r'https?://creative\.arte\.tv/(?Pfr|de)/(?:magazine?/)?(?P[^?#]+)' + _VALID_URL = r'https?://creative\.arte\.tv/(?Pfr|de|en|es)/(?:magazine?/)?(?P[^/?#&]+)' _TESTS = [{ 'url': 'http://creative.arte.tv/de/magazin/agentur-amateur-corporate-design', @@ -199,7 +231,7 @@ class ArteTVCreativeIE(ArteTVPlus7IE): class ArteTVFutureIE(ArteTVPlus7IE): IE_NAME = 'arte.tv:future' - _VALID_URL = r'https?://future\.arte\.tv/(?Pfr|de)/(?P.+)' + _VALID_URL = r'https?://future\.arte\.tv/(?Pfr|de|en|es)/(?P[^/?#&]+)' _TESTS = [{ 'url': 'http://future.arte.tv/fr/info-sciences/les-ecrevisses-aussi-sont-anxieuses', @@ -207,6 +239,7 @@ class ArteTVFutureIE(ArteTVPlus7IE): 'id': '050940-028-A', 'ext': 'mp4', 'title': 'Les écrevisses aussi peuvent être anxieuses', + 'upload_date': '20140902', }, }, { 'url': 'http://future.arte.tv/fr/la-science-est-elle-responsable', @@ -216,7 +249,7 @@ class ArteTVFutureIE(ArteTVPlus7IE): class ArteTVDDCIE(ArteTVPlus7IE): IE_NAME = 'arte.tv:ddc' - _VALID_URL = r'https?://ddc\.arte\.tv/(?Pemission|folge)/(?P.+)' + _VALID_URL = r'https?://ddc\.arte\.tv/(?Pemission|folge)/(?P[^/?#&]+)' def _real_extract(self, url): video_id, lang = self._extract_url_info(url) @@ -234,7 +267,7 @@ class ArteTVDDCIE(ArteTVPlus7IE): class ArteTVConcertIE(ArteTVPlus7IE): IE_NAME = 'arte.tv:concert' - _VALID_URL = r'https?://concert\.arte\.tv/(?Pde|fr)/(?P.+)' + _VALID_URL = r'https?://concert\.arte\.tv/(?Pfr|de|en|es)/(?P[^/?#&]+)' _TEST = { 'url': 'http://concert.arte.tv/de/notwist-im-pariser-konzertclub-divan-du-monde', @@ -249,6 +282,54 @@ class ArteTVConcertIE(ArteTVPlus7IE): } +class ArteTVCinemaIE(ArteTVPlus7IE): + IE_NAME = 'arte.tv:cinema' + _VALID_URL = r'https?://cinema\.arte\.tv/(?Pfr|de|en|es)/(?P.+)' + + _TEST = { + 'url': 'http://cinema.arte.tv/de/node/38291', + 'md5': 
'6b275511a5107c60bacbeeda368c3aa1', + 'info_dict': { + 'id': '055876-000_PWA12025-D', + 'ext': 'mp4', + 'title': 'Tod auf dem Nil', + 'upload_date': '20160122', + 'description': 'md5:7f749bbb77d800ef2be11d54529b96bc', + }, + } + + +class ArteTVMagazineIE(ArteTVPlus7IE): + IE_NAME = 'arte.tv:magazine' + _VALID_URL = r'https?://(?:www\.)?arte\.tv/magazine/[^/]+/(?Pfr|de|en|es)/(?P[^/?#&]+)' + + _TESTS = [{ + # Embedded via