_ Git - youtube-dl/blob - youtube_dl/extractor/arte.py

   1 # coding: utf-8
   2 from __future__ import unicode_literals
   3
   4 import re
   5
   6 from .common import InfoExtractor
   7 from ..compat import (
   8     compat_parse_qs,
   9     compat_urllib_parse_urlparse,
  10 )
  11 from ..utils import (
  12     ExtractorError,
  13     find_xpath_attr,
  14     get_element_by_attribute,
  15     int_or_none,
  16     NO_DEFAULT,
  17     qualities,
  18     unified_strdate,
  19 )
  20
# arte.tv serves video from several different sources, and the extraction
# process differs for each one. The videos usually expire after 7 days, so
# we can't add tests.
  24
  25
  26 class ArteTvIE(InfoExtractor):
  27     _VALID_URL = r'https?://videos\.arte\.tv/(?P<lang>fr|de|en|es)/.*-(?P<id>.*?)\.html'
  28     IE_NAME = 'arte.tv'
  29
  30     def _real_extract(self, url):
  31         mobj = re.match(self._VALID_URL, url)
  32         lang = mobj.group('lang')
  33         video_id = mobj.group('id')
  34
  35         ref_xml_url = url.replace('/videos/', '/do_delegate/videos/')
  36         ref_xml_url = ref_xml_url.replace('.html', ',view,asPlayerXml.xml')
  37         ref_xml_doc = self._download_xml(
  38             ref_xml_url, video_id, note='Downloading metadata')
  39         config_node = find_xpath_attr(ref_xml_doc, './/video', 'lang', lang)
  40         config_xml_url = config_node.attrib['ref']
  41         config = self._download_xml(
  42             config_xml_url, video_id, note='Downloading configuration')
  43
  44         formats = [{
  45             'format_id': q.attrib['quality'],
  46             # The playpath starts at 'mp4:', if we don't manually
  47             # split the url, rtmpdump will incorrectly parse them
  48             'url': q.text.split('mp4:', 1)[0],
  49             'play_path': 'mp4:' + q.text.split('mp4:', 1)[1],
  50             'ext': 'flv',
  51             'quality': 2 if q.attrib['quality'] == 'hd' else 1,
  52         } for q in config.findall('./urls/url')]
  53         self._sort_formats(formats)
  54
  55         title = config.find('.//name').text
  56         thumbnail = config.find('.//firstThumbnailUrl').text
  57         return {
  58             'id': video_id,
  59             'title': title,
  60             'thumbnail': thumbnail,
  61             'formats': formats,
  62         }
  63
  64
  65 class ArteTVBaseIE(InfoExtractor):
  66     @classmethod
  67     def _extract_url_info(cls, url):
  68         mobj = re.match(cls._VALID_URL, url)
  69         lang = mobj.group('lang')
  70         query = compat_parse_qs(compat_urllib_parse_urlparse(url).query)
  71         if 'vid' in query:
  72             video_id = query['vid'][0]
  73         else:
  74             # This is not a real id, it can be for example AJT for the news
  75             # http://www.arte.tv/guide/fr/emissions/AJT/arte-journal
  76             video_id = mobj.group('id')
  77         return video_id, lang
  78
  79     def _extract_from_json_url(self, json_url, video_id, lang, title=None):
  80         info = self._download_json(json_url, video_id)
  81         player_info = info['videoJsonPlayer']
  82
  83         vsr = player_info['VSR']
  84
  85         if not vsr and not player_info.get('VRU'):
  86             raise ExtractorError(
  87                 'Video %s is not available' % player_info.get('VID') or video_id,
  88                 expected=True)
  89
  90         upload_date_str = player_info.get('shootingDate')
  91         if not upload_date_str:
  92             upload_date_str = (player_info.get('VRA') or player_info.get('VDA') or '').split(' ')[0]
  93
  94         title = (player_info.get('VTI') or title or player_info['VID']).strip()
  95         subtitle = player_info.get('VSU', '').strip()
  96         if subtitle:
  97             title += ' - %s' % subtitle
  98
  99         info_dict = {
 100             'id': player_info['VID'],
 101             'title': title,
 102             'description': player_info.get('VDE'),
 103             'upload_date': unified_strdate(upload_date_str),
 104             'thumbnail': player_info.get('programImage') or player_info.get('VTU', {}).get('IUR'),
 105         }
 106         qfunc = qualities(['HQ', 'MQ', 'EQ', 'SQ'])
 107
 108         LANGS = {
 109             'fr': 'F',
 110             'de': 'A',
 111             'en': 'E[ANG]',
 112             'es': 'E[ESP]',
 113         }
 114
 115         langcode = LANGS.get(lang, lang)
 116
 117         formats = []
 118         for format_id, format_dict in vsr.items():
 119             f = dict(format_dict)
 120             versionCode = f.get('versionCode')
 121             l = re.escape(langcode)
 122
 123             # Language preference from most to least priority
 124             # Reference: section 5.6.3 of
 125             # http://www.arte.tv/sites/en/corporate/files/complete-technical-guidelines-arte-geie-v1-05.pdf
 126             PREFERENCES = (
 127                 # original version in requested language, without subtitles
 128                 r'VO{0}$'.format(l),
 129                 # original version in requested language, with partial subtitles in requested language
 130                 r'VO{0}-ST{0}$'.format(l),
 131                 # original version in requested language, with subtitles for the deaf and hard-of-hearing in requested language
 132                 r'VO{0}-STM{0}$'.format(l),
 133                 # non-original (dubbed) version in requested language, without subtitles
 134                 r'V{0}$'.format(l),
 135                 # non-original (dubbed) version in requested language, with subtitles partial subtitles in requested language
 136                 r'V{0}-ST{0}$'.format(l),
 137                 # non-original (dubbed) version in requested language, with subtitles for the deaf and hard-of-hearing in requested language
 138                 r'V{0}-STM{0}$'.format(l),
 139                 # original version in requested language, with partial subtitles in different language
 140                 r'VO{0}-ST(?!{0}).+?$'.format(l),
 141                 # original version in requested language, with subtitles for the deaf and hard-of-hearing in different language
 142                 r'VO{0}-STM(?!{0}).+?$'.format(l),
 143                 # original version in different language, with partial subtitles in requested language
 144                 r'VO(?:(?!{0}).+?)?-ST{0}$'.format(l),
 145                 # original version in different language, with subtitles for the deaf and hard-of-hearing in requested language
 146                 r'VO(?:(?!{0}).+?)?-STM{0}$'.format(l),
 147                 # original version in different language, without subtitles
 148                 r'VO(?:(?!{0}))?$'.format(l),
 149                 # original version in different language, with partial subtitles in different language
 150                 r'VO(?:(?!{0}).+?)?-ST(?!{0}).+?$'.format(l),
 151                 # original version in different language, with subtitles for the deaf and hard-of-hearing in different language
 152                 r'VO(?:(?!{0}).+?)?-STM(?!{0}).+?$'.format(l),
 153             )
 154
 155             for pref, p in enumerate(PREFERENCES):
 156                 if re.match(p, versionCode):
 157                     lang_pref = len(PREFERENCES) - pref
 158                     break
 159             else:
 160                 lang_pref = -1
 161
 162             format = {
 163                 'format_id': format_id,
 164                 'preference': -10 if f.get('videoFormat') == 'M3U8' else None,
 165                 'language_preference': lang_pref,
 166                 'format_note': '%s, %s' % (f.get('versionCode'), f.get('versionLibelle')),
 167                 'width': int_or_none(f.get('width')),
 168                 'height': int_or_none(f.get('height')),
 169                 'tbr': int_or_none(f.get('bitrate')),
 170                 'quality': qfunc(f.get('quality')),
 171             }
 172
 173             if f.get('mediaType') == 'rtmp':
 174                 format['url'] = f['streamer']
 175                 format['play_path'] = 'mp4:' + f['url']
 176                 format['ext'] = 'flv'
 177             else:
 178                 format['url'] = f['url']
 179
 180             formats.append(format)
 181
 182         self._check_formats(formats, video_id)
 183         self._sort_formats(formats)
 184
 185         info_dict['formats'] = formats
 186         return info_dict
 187
 188
 189 class ArteTVPlus7IE(ArteTVBaseIE):
 190     IE_NAME = 'arte.tv:+7'
 191     _VALID_URL = r'https?://(?:(?:www|sites)\.)?arte\.tv/(?:[^/]+/)?(?P<lang>fr|de|en|es)/(?:videos/)?(?:[^/]+/)*(?P<id>[^/?#&]+)'
 192
 193     _TESTS = [{
 194         'url': 'http://www.arte.tv/guide/de/sendungen/XEN/xenius/?vid=055918-015_PLUS7-D',
 195         'only_matching': True,
 196     }, {
 197         'url': 'http://sites.arte.tv/karambolage/de/video/karambolage-22',
 198         'only_matching': True,
 199     }, {
 200         'url': 'http://www.arte.tv/de/videos/048696-000-A/der-kluge-bauch-unser-zweites-gehirn',
 201         'only_matching': True,
 202     }]
 203
 204     @classmethod
 205     def suitable(cls, url):
 206         return False if ArteTVPlaylistIE.suitable(url) else super(ArteTVPlus7IE, cls).suitable(url)
 207
 208     def _real_extract(self, url):
 209         video_id, lang = self._extract_url_info(url)
 210         webpage = self._download_webpage(url, video_id)
 211         return self._extract_from_webpage(webpage, video_id, lang)
 212
 213     def _extract_from_webpage(self, webpage, video_id, lang):
 214         patterns_templates = (r'arte_vp_url=["\'](.*?%s.*?)["\']', r'data-url=["\']([^"]+%s[^"]+)["\']')
 215         ids = (video_id, '')
 216         # some pages contain multiple videos (like
 217         # http://www.arte.tv/guide/de/sendungen/XEN/xenius/?vid=055918-015_PLUS7-D),
 218         # so we first try to look for json URLs that contain the video id from
 219         # the 'vid' parameter.
 220         patterns = [t % re.escape(_id) for _id in ids for t in patterns_templates]
 221         json_url = self._html_search_regex(
 222             patterns, webpage, 'json vp url', default=None)
 223         if not json_url:
 224             def find_iframe_url(webpage, default=NO_DEFAULT):
 225                 return self._html_search_regex(
 226                     r'<iframe[^>]+src=(["\'])(?P<url>.+\bjson_url=.+?)\1',
 227                     webpage, 'iframe url', group='url', default=default)
 228
 229             iframe_url = find_iframe_url(webpage, None)
 230             if not iframe_url:
 231                 embed_url = self._html_search_regex(
 232                     r'arte_vp_url_oembed=\'([^\']+?)\'', webpage, 'embed url', default=None)
 233                 if embed_url:
 234                     player = self._download_json(
 235                         embed_url, video_id, 'Downloading player page')
 236                     iframe_url = find_iframe_url(player['html'])
 237             # en and es URLs produce react-based pages with different layout (e.g.
 238             # http://www.arte.tv/guide/en/053330-002-A/carnival-italy?zone=world)
 239             if not iframe_url:
 240                 program = self._search_regex(
 241                     r'program\s*:\s*({.+?["\']embed_html["\'].+?}),?\s*\n',
 242                     webpage, 'program', default=None)
 243                 if program:
 244                     embed_html = self._parse_json(program, video_id)
 245                     if embed_html:
 246                         iframe_url = find_iframe_url(embed_html['embed_html'])
 247             if iframe_url:
 248                 json_url = compat_parse_qs(
 249                     compat_urllib_parse_urlparse(iframe_url).query)['json_url'][0]
 250         if json_url:
 251             title = self._search_regex(
 252                 r'<h3[^>]+title=(["\'])(?P<title>.+?)\1',
 253                 webpage, 'title', default=None, group='title')
 254             return self._extract_from_json_url(json_url, video_id, lang, title=title)
 255         # Different kind of embed URL (e.g.
 256         # http://www.arte.tv/magazine/trepalium/fr/episode-0406-replay-trepalium)
 257         entries = [
 258             self.url_result(url)
 259             for _, url in re.findall(r'<iframe[^>]+src=(["\'])(?P<url>.+?)\1', webpage)]
 260         return self.playlist_result(entries)
 261
 262
 263 # It also uses the arte_vp_url url from the webpage to extract the information
 264 class ArteTVCreativeIE(ArteTVPlus7IE):
 265     IE_NAME = 'arte.tv:creative'
 266     _VALID_URL = r'https?://creative\.arte\.tv/(?P<lang>fr|de|en|es)/(?:[^/]+/)*(?P<id>[^/?#&]+)'
 267
 268     _TESTS = [{
 269         'url': 'http://creative.arte.tv/fr/episode/osmosis-episode-1',
 270         'info_dict': {
 271             'id': '057405-001-A',
 272             'ext': 'mp4',
 273             'title': 'OSMOSIS - N\'AYEZ PLUS PEUR D\'AIMER (1)',
 274             'upload_date': '20150716',
 275         },
 276     }, {
 277         'url': 'http://creative.arte.tv/fr/Monty-Python-Reunion',
 278         'playlist_count': 11,
 279         'add_ie': ['Youtube'],
 280     }, {
 281         'url': 'http://creative.arte.tv/de/episode/agentur-amateur-4-der-erste-kunde',
 282         'only_matching': True,
 283     }]
 284
 285
 286 class ArteTVInfoIE(ArteTVPlus7IE):
 287     IE_NAME = 'arte.tv:info'
 288     _VALID_URL = r'https?://info\.arte\.tv/(?P<lang>fr|de|en|es)/(?:[^/]+/)*(?P<id>[^/?#&]+)'
 289
 290     _TESTS = [{
 291         'url': 'http://info.arte.tv/fr/service-civique-un-cache-misere',
 292         'info_dict': {
 293             'id': '067528-000-A',
 294             'ext': 'mp4',
 295             'title': 'Service civique, un cache misère ?',
 296             'upload_date': '20160403',
 297         },
 298     }]
 299
 300
 301 class ArteTVFutureIE(ArteTVPlus7IE):
 302     IE_NAME = 'arte.tv:future'
 303     _VALID_URL = r'https?://future\.arte\.tv/(?P<lang>fr|de|en|es)/(?P<id>[^/?#&]+)'
 304
 305     _TESTS = [{
 306         'url': 'http://future.arte.tv/fr/info-sciences/les-ecrevisses-aussi-sont-anxieuses',
 307         'info_dict': {
 308             'id': '050940-028-A',
 309             'ext': 'mp4',
 310             'title': 'Les écrevisses aussi peuvent être anxieuses',
 311             'upload_date': '20140902',
 312         },
 313     }, {
 314         'url': 'http://future.arte.tv/fr/la-science-est-elle-responsable',
 315         'only_matching': True,
 316     }]
 317
 318
 319 class ArteTVDDCIE(ArteTVPlus7IE):
 320     IE_NAME = 'arte.tv:ddc'
 321     _VALID_URL = r'https?://ddc\.arte\.tv/(?P<lang>emission|folge)/(?P<id>[^/?#&]+)'
 322
 323     _TESTS = []
 324
 325     def _real_extract(self, url):
 326         video_id, lang = self._extract_url_info(url)
 327         if lang == 'folge':
 328             lang = 'de'
 329         elif lang == 'emission':
 330             lang = 'fr'
 331         webpage = self._download_webpage(url, video_id)
 332         scriptElement = get_element_by_attribute('class', 'visu_video_block', webpage)
 333         script_url = self._html_search_regex(r'src="(.*?)"', scriptElement, 'script url')
 334         javascriptPlayerGenerator = self._download_webpage(script_url, video_id, 'Download javascript player generator')
 335         json_url = self._search_regex(r"json_url=(.*)&rendering_place.*", javascriptPlayerGenerator, 'json url')
 336         return self._extract_from_json_url(json_url, video_id, lang)
 337
 338
 339 class ArteTVConcertIE(ArteTVPlus7IE):
 340     IE_NAME = 'arte.tv:concert'
 341     _VALID_URL = r'https?://concert\.arte\.tv/(?P<lang>fr|de|en|es)/(?P<id>[^/?#&]+)'
 342
 343     _TESTS = [{
 344         'url': 'http://concert.arte.tv/de/notwist-im-pariser-konzertclub-divan-du-monde',
 345         'md5': '9ea035b7bd69696b67aa2ccaaa218161',
 346         'info_dict': {
 347             'id': '186',
 348             'ext': 'mp4',
 349             'title': 'The Notwist im Pariser Konzertclub "Divan du Monde"',
 350             'upload_date': '20140128',
 351             'description': 'md5:486eb08f991552ade77439fe6d82c305',
 352         },
 353     }]
 354
 355
 356 class ArteTVCinemaIE(ArteTVPlus7IE):
 357     IE_NAME = 'arte.tv:cinema'
 358     _VALID_URL = r'https?://cinema\.arte\.tv/(?P<lang>fr|de|en|es)/(?P<id>.+)'
 359
 360     _TESTS = [{
 361         'url': 'http://cinema.arte.tv/fr/article/les-ailes-du-desir-de-julia-reck',
 362         'md5': 'a5b9dd5575a11d93daf0e3f404f45438',
 363         'info_dict': {
 364             'id': '062494-000-A',
 365             'ext': 'mp4',
 366             'title': 'Film lauréat du concours web - "Les ailes du désir" de Julia Reck',
 367             'upload_date': '20150807',
 368         },
 369     }]
 370
 371
 372 class ArteTVMagazineIE(ArteTVPlus7IE):
 373     IE_NAME = 'arte.tv:magazine'
 374     _VALID_URL = r'https?://(?:www\.)?arte\.tv/magazine/[^/]+/(?P<lang>fr|de|en|es)/(?P<id>[^/?#&]+)'
 375
 376     _TESTS = [{
 377         # Embedded via <iframe src="http://www.arte.tv/arte_vp/index.php?json_url=..."
 378         'url': 'http://www.arte.tv/magazine/trepalium/fr/entretien-avec-le-realisateur-vincent-lannoo-trepalium',
 379         'md5': '2a9369bcccf847d1c741e51416299f25',
 380         'info_dict': {
 381             'id': '065965-000-A',
 382             'ext': 'mp4',
 383             'title': 'Trepalium - Extrait Ep.01',
 384             'upload_date': '20160121',
 385         },
 386     }, {
 387         # Embedded via <iframe src="http://www.arte.tv/guide/fr/embed/054813-004-A/medium"
 388         'url': 'http://www.arte.tv/magazine/trepalium/fr/episode-0406-replay-trepalium',
 389         'md5': 'fedc64fc7a946110fe311634e79782ca',
 390         'info_dict': {
 391             'id': '054813-004_PLUS7-F',
 392             'ext': 'mp4',
 393             'title': 'Trepalium (4/6)',
 394             'description': 'md5:10057003c34d54e95350be4f9b05cb40',
 395             'upload_date': '20160218',
 396         },
 397     }, {
 398         'url': 'http://www.arte.tv/magazine/metropolis/de/frank-woeste-german-paris-metropolis',
 399         'only_matching': True,
 400     }]
 401
 402
 403 class ArteTVEmbedIE(ArteTVPlus7IE):
 404     IE_NAME = 'arte.tv:embed'
 405     _VALID_URL = r'''(?x)
 406         http://www\.arte\.tv
 407         /(?:playerv2/embed|arte_vp/index)\.php\?json_url=
 408         (?P<json_url>
 409             http://arte\.tv/papi/tvguide/videos/stream/player/
 410             (?P<lang>[^/]+)/(?P<id>[^/]+)[^&]*
 411         )
 412     '''
 413
 414     _TESTS = []
 415
 416     def _real_extract(self, url):
 417         mobj = re.match(self._VALID_URL, url)
 418         video_id = mobj.group('id')
 419         lang = mobj.group('lang')
 420         json_url = mobj.group('json_url')
 421         return self._extract_from_json_url(json_url, video_id, lang)
 422
 423
 424 class TheOperaPlatformIE(ArteTVPlus7IE):
 425     IE_NAME = 'theoperaplatform'
 426     _VALID_URL = r'https?://(?:www\.)?theoperaplatform\.eu/(?P<lang>fr|de|en|es)/(?P<id>[^/?#&]+)'
 427
 428     _TESTS = [{
 429         'url': 'http://www.theoperaplatform.eu/de/opera/verdi-otello',
 430         'md5': '970655901fa2e82e04c00b955e9afe7b',
 431         'info_dict': {
 432             'id': '060338-009-A',
 433             'ext': 'mp4',
 434             'title': 'Verdi - OTELLO',
 435             'upload_date': '20160927',
 436         },
 437     }]
 438
 439
 440 class ArteTVPlaylistIE(ArteTVBaseIE):
 441     IE_NAME = 'arte.tv:playlist'
 442     _VALID_URL = r'https?://(?:www\.)?arte\.tv/guide/(?P<lang>fr|de|en|es)/[^#]*#collection/(?P<id>PL-\d+)'
 443
 444     _TESTS = [{
 445         'url': 'http://www.arte.tv/guide/de/plus7/?country=DE#collection/PL-013263/ARTETV',
 446         'info_dict': {
 447             'id': 'PL-013263',
 448             'title': 'Areva & Uramin',
 449             'description': 'md5:a1dc0312ce357c262259139cfd48c9bf',
 450         },
 451         'playlist_mincount': 6,
 452     }, {
 453         'url': 'http://www.arte.tv/guide/de/playlists?country=DE#collection/PL-013190/ARTETV',
 454         'only_matching': True,
 455     }]
 456
 457     def _real_extract(self, url):
 458         playlist_id, lang = self._extract_url_info(url)
 459         collection = self._download_json(
 460             'https://api.arte.tv/api/player/v1/collectionData/%s/%s?source=videos'
 461             % (lang, playlist_id), playlist_id)
 462         title = collection.get('title')
 463         description = collection.get('shortDescription') or collection.get('teaserText')
 464         entries = [
 465             self._extract_from_json_url(
 466                 video['jsonUrl'], video.get('programId') or playlist_id, lang)
 467             for video in collection['videos'] if video.get('jsonUrl')]
 468         return self.playlist_result(entries, playlist_id, title, description)