X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Fextractor%2Farte.py;h=979481b2198134859bfbdf79302e189c4ad363fc;hb=a9c2896e2252839c2e4801189f10acce7ff6413e;hp=b35a679e3b036d2c573a4f1fc85d53bd793f745b;hpb=6c5ad80cdcd3f51b61a9d21c55e21d51e6b2f39a;p=youtube-dl diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py index b35a679e3..979481b21 100644 --- a/youtube_dl/extractor/arte.py +++ b/youtube_dl/extractor/arte.py @@ -1,7 +1,8 @@ # encoding: utf-8 +from __future__ import unicode_literals + import re import json -import xml.etree.ElementTree from .common import InfoExtractor from ..utils import ( @@ -11,6 +12,7 @@ from ..utils import ( determine_ext, get_element_by_id, compat_str, + get_element_by_attribute, ) # There are different sources of video in arte.tv, the extraction process @@ -18,11 +20,11 @@ from ..utils import ( # add tests. class ArteTvIE(InfoExtractor): - _VIDEOS_URL = r'(?:http://)?videos.arte.tv/(?Pfr|de)/.*-(?P.*?).html' - _LIVEWEB_URL = r'(?:http://)?liveweb.arte.tv/(?Pfr|de)/(?P.+?)/(?P.+)' + _VIDEOS_URL = r'(?:http://)?videos\.arte\.tv/(?Pfr|de)/.*-(?P.*?)\.html' + _LIVEWEB_URL = r'(?:http://)?liveweb\.arte\.tv/(?Pfr|de)/(?P.+?)/(?P.+)' _LIVE_URL = r'index-[0-9]+\.html$' - IE_NAME = u'arte.tv' + IE_NAME = 'arte.tv' @classmethod def suitable(cls, url): @@ -37,7 +39,7 @@ class ArteTvIE(InfoExtractor): # r'src="(.*?/videothek_js.*?\.js)', # 0, # [ - # (1, 'url', u'Invalid URL: %s' % url) + # (1, 'url', 'Invalid URL: %s' % url) # ] # ) # http_host = url.split('/')[2] @@ -49,12 +51,12 @@ class ArteTvIE(InfoExtractor): # '(rtmp://.*?)\'', # re.DOTALL, # [ - # (1, 'path', u'could not extract video path: %s' % url), - # (2, 'player', u'could not extract video player: %s' % url), - # (3, 'url', u'could not extract video url: %s' % url) + # (1, 'path', 'could not extract video path: %s' % url), + # (2, 'player', 'could not extract video player: %s' % url), + # (3, 'url', 'could not extract video url: %s' % url) # ] # ) - # video_url = u'%s/%s' % (info.get('url'), info.get('path')) + # video_url = '%s/%s' % (info.get('url'), info.get('path')) def _real_extract(self, url): mobj = re.match(self._VIDEOS_URL, url) @@ -69,20 +71,23 @@ class ArteTvIE(InfoExtractor): lang = mobj.group('lang') return self._extract_liveweb(url, name, lang) - if re.search(self._LIVE_URL, video_id) is not None: - raise ExtractorError(u'Arte live streams are not yet supported, sorry') + if re.search(self._LIVE_URL, url) is not None: + raise ExtractorError('Arte live streams are not yet supported, sorry') # self.extractLiveStream(url) # return + raise ExtractorError('No video found') + def _extract_video(self, url, video_id, lang): """Extract from videos.arte.tv""" ref_xml_url = url.replace('/videos/', '/do_delegate/videos/') ref_xml_url = ref_xml_url.replace('.html', ',view,asPlayerXml.xml') - ref_xml = self._download_webpage(ref_xml_url, video_id, note=u'Downloading metadata') - ref_xml_doc = xml.etree.ElementTree.fromstring(ref_xml) + ref_xml_doc = self._download_xml( + ref_xml_url, video_id, note='Downloading metadata') config_node = find_xpath_attr(ref_xml_doc, './/video', 'lang', lang) config_xml_url = config_node.attrib['ref'] - config_xml = self._download_webpage(config_xml_url, video_id, note=u'Downloading configuration') + config_xml = self._download_webpage( + config_xml_url, video_id, note='Downloading configuration') video_urls = list(re.finditer(r'(?P.*?)', config_xml)) def _key(m): @@ -108,14 +113,13 @@ class ArteTvIE(InfoExtractor): def _extract_liveweb(self, url, name, lang): """Extract form http://liveweb.arte.tv/""" webpage = self._download_webpage(url, name) - video_id = self._search_regex(r'eventId=(\d+?)("|&)', webpage, u'event id') - config_xml = self._download_webpage('http://download.liveweb.arte.tv/o21/liveweb/events/event-%s.xml' % video_id, - video_id, u'Downloading information') - config_doc = xml.etree.ElementTree.fromstring(config_xml.encode('utf-8')) + video_id = self._search_regex(r'eventId=(\d+?)("|&)', webpage, 'event id') + config_doc = self._download_xml('http://download.liveweb.arte.tv/o21/liveweb/events/event-%s.xml' % video_id, + video_id, 'Downloading information') event_doc = config_doc.find('event') url_node = event_doc.find('video').find('urlHd') if url_node is None: - url_node = video_doc.find('urlSd') + url_node = event_doc.find('urlSd') return {'id': video_id, 'title': event_doc.find('name%s' % lang.capitalize()).text, @@ -126,8 +130,8 @@ class ArteTvIE(InfoExtractor): class ArteTVPlus7IE(InfoExtractor): - IE_NAME = u'arte.tv:+7' - _VALID_URL = r'https?://www\.arte.tv/guide/(?Pfr|de)/(?:(?:sendungen|emissions)/)?(?P.*?)/(?P.*?)(\?.*)?' + IE_NAME = 'arte.tv:+7' + _VALID_URL = r'https?://(?:www\.)?arte\.tv/guide/(?Pfr|de)/(?:(?:sendungen|emissions)/)?(?P.*?)/(?P.*?)(\?.*)?' @classmethod def _extract_url_info(cls, url): @@ -145,7 +149,9 @@ class ArteTVPlus7IE(InfoExtractor): def _extract_from_webpage(self, webpage, video_id, lang): json_url = self._html_search_regex(r'arte_vp_url="(.*?)"', webpage, 'json url') + return self._extract_from_json_url(json_url, video_id, lang) + def _extract_from_json_url(self, json_url, video_id, lang): json_info = self._download_webpage(json_url, video_id, 'Downloading info json') self.report_extraction(video_id) info = json.loads(json_info) @@ -196,6 +202,8 @@ class ArteTVPlus7IE(InfoExtractor): re.match(r'VO-ST(F|A)', f.get('versionCode', '')) is None, # The version with sourds/mal subtitles has also lower relevance re.match(r'VO?(F|A)-STM\1', f.get('versionCode', '')) is None, + # Prefer http downloads over m3u8 + 0 if f['url'].endswith('m3u8') else 1, ) formats = sorted(formats, key=sort_key) def _format(format_info): @@ -207,7 +215,7 @@ class ArteTVPlus7IE(InfoExtractor): if bitrate is not None: quality += '-%d' % bitrate if format_info.get('versionCode') is not None: - format_id = u'%s-%s' % (quality, format_info['versionCode']) + format_id = '%s-%s' % (quality, format_info['versionCode']) else: format_id = quality info = { @@ -216,7 +224,7 @@ class ArteTVPlus7IE(InfoExtractor): 'width': format_info.get('width'), 'height': height, } - if format_info['mediaType'] == u'rtmp': + if format_info['mediaType'] == 'rtmp': info['url'] = format_info['streamer'] info['play_path'] = 'mp4:' + format_info['url'] info['ext'] = 'flv' @@ -231,27 +239,29 @@ class ArteTVPlus7IE(InfoExtractor): # It also uses the arte_vp_url url from the webpage to extract the information class ArteTVCreativeIE(ArteTVPlus7IE): - IE_NAME = u'arte.tv:creative' + IE_NAME = 'arte.tv:creative' _VALID_URL = r'https?://creative\.arte\.tv/(?Pfr|de)/magazine?/(?P.+)' _TEST = { - u'url': u'http://creative.arte.tv/de/magazin/agentur-amateur-corporate-design', - u'file': u'050489-002.mp4', - u'info_dict': { - u'title': u'Agentur Amateur / Agence Amateur #2 : Corporate Design', + 'url': 'http://creative.arte.tv/de/magazin/agentur-amateur-corporate-design', + 'info_dict': { + 'id': '050489-002', + 'ext': 'mp4', + 'title': 'Agentur Amateur / Agence Amateur #2 : Corporate Design', }, } class ArteTVFutureIE(ArteTVPlus7IE): - IE_NAME = u'arte.tv:future' + IE_NAME = 'arte.tv:future' _VALID_URL = r'https?://future\.arte\.tv/(?Pfr|de)/(thema|sujet)/.*?#article-anchor-(?P\d+)' _TEST = { - u'url': u'http://future.arte.tv/fr/sujet/info-sciences#article-anchor-7081', - u'file': u'050940-003.mp4', - u'info_dict': { - u'title': u'Les champignons au secours de la planète', + 'url': 'http://future.arte.tv/fr/sujet/info-sciences#article-anchor-7081', + 'info_dict': { + 'id': '050940-003', + 'ext': 'mp4', + 'title': 'Les champignons au secours de la planète', }, } @@ -260,3 +270,38 @@ class ArteTVFutureIE(ArteTVPlus7IE): webpage = self._download_webpage(url, anchor_id) row = get_element_by_id(anchor_id, webpage) return self._extract_from_webpage(row, anchor_id, lang) + + +class ArteTVDDCIE(ArteTVPlus7IE): + IE_NAME = 'arte.tv:ddc' + _VALID_URL = r'https?://ddc\.arte\.tv/(?Pemission|folge)/(?P.+)' + + def _real_extract(self, url): + video_id, lang = self._extract_url_info(url) + if lang == 'folge': + lang = 'de' + elif lang == 'emission': + lang = 'fr' + webpage = self._download_webpage(url, video_id) + scriptElement = get_element_by_attribute('class', 'visu_video_block', webpage) + script_url = self._html_search_regex(r'src="(.*?)"', scriptElement, 'script url') + javascriptPlayerGenerator = self._download_webpage(script_url, video_id, 'Download javascript player generator') + json_url = self._search_regex(r"json_url=(.*)&rendering_place.*", javascriptPlayerGenerator, 'json url') + return self._extract_from_json_url(json_url, video_id, lang) + + +class ArteTVConcertIE(ArteTVPlus7IE): + IE_NAME = 'arte.tv:concert' + _VALID_URL = r'https?://concert\.arte\.tv/(?Pde|fr)/(?P.+)' + + _TEST = { + 'url': 'http://concert.arte.tv/de/notwist-im-pariser-konzertclub-divan-du-monde', + 'md5': '9ea035b7bd69696b67aa2ccaaa218161', + 'info_dict': { + 'id': '186', + 'ext': 'mp4', + 'title': 'The Notwist im Pariser Konzertclub "Divan du Monde"', + 'upload_date': '20140128', + 'description': 'md5:486eb08f991552ade77439fe6d82c305', + }, + }