_ Git - youtube-dl/blob - youtube_dl/extractor/azmedien.py

   1 from __future__ import unicode_literals
   2
   3 import re
   4
   5 from .common import InfoExtractor
   6 from .kaltura import KalturaIE
   7 from ..utils import (
   8     get_element_by_id,
   9     strip_or_none,
  10     urljoin,
  11 )
  12
  13
  14 class AZMedienBaseIE(InfoExtractor):
  15     def _kaltura_video(self, partner_id, entry_id):
  16         return self.url_result(
  17             'kaltura:%s:%s' % (partner_id, entry_id), ie=KalturaIE.ie_key(),
  18             video_id=entry_id)
  19
  20
  21 class AZMedienIE(AZMedienBaseIE):
  22     IE_DESC = 'AZ Medien videos'
  23     _VALID_URL = r'''(?x)
  24                     https?://
  25                         (?:www\.)?
  26                         (?:
  27                             telezueri\.ch|
  28                             telebaern\.tv|
  29                             telem1\.ch
  30                         )/
  31                         [0-9]+-show-[^/\#]+
  32                         (?:
  33                             /[0-9]+-episode-[^/\#]+
  34                             (?:
  35                                 /[0-9]+-segment-(?:[^/\#]+\#)?|
  36                                 \#
  37                             )|
  38                             \#
  39                         )
  40                         (?P<id>[^\#]+)
  41                     '''
  42
  43     _TESTS = [{
  44         # URL with 'segment'
  45         'url': 'http://www.telezueri.ch/62-show-zuerinews/13772-episode-sonntag-18-dezember-2016/32419-segment-massenabweisungen-beim-hiltl-club-wegen-pelzboom',
  46         'info_dict': {
  47             'id': '1_2444peh4',
  48             'ext': 'mov',
  49             'title': 'Massenabweisungen beim Hiltl Club wegen Pelzboom',
  50             'description': 'md5:9ea9dd1b159ad65b36ddcf7f0d7c76a8',
  51             'uploader_id': 'TeleZ?ri',
  52             'upload_date': '20161218',
  53             'timestamp': 1482084490,
  54         },
  55         'params': {
  56             'skip_download': True,
  57         },
  58     }, {
  59         # URL with 'segment' and fragment:
  60         'url': 'http://www.telebaern.tv/118-show-news/14240-episode-dienstag-17-januar-2017/33666-segment-achtung-gefahr#zu-wenig-pflegerinnen-und-pfleger',
  61         'only_matching': True
  62     }, {
  63         # URL with 'episode' and fragment:
  64         'url': 'http://www.telem1.ch/47-show-sonntalk/13986-episode-soldaten-fuer-grenzschutz-energiestrategie-obama-bilanz#soldaten-fuer-grenzschutz-energiestrategie-obama-bilanz',
  65         'only_matching': True
  66     }, {
  67         # URL with 'show' and fragment:
  68         'url': 'http://www.telezueri.ch/66-show-sonntalk#burka-plakate-trump-putin-china-besuch',
  69         'only_matching': True
  70     }]
  71
  72     def _real_extract(self, url):
  73         video_id = self._match_id(url)
  74
  75         webpage = self._download_webpage(url, video_id)
  76
  77         partner_id = self._search_regex(
  78             r'<script[^>]+src=["\'](?:https?:)?//(?:[^/]+\.)?kaltura\.com(?:/[^/]+)*/(?:p|partner_id)/([0-9]+)',
  79             webpage, 'kaltura partner id')
  80         entry_id = self._html_search_regex(
  81             r'<a[^>]+data-id=(["\'])(?P<id>(?:(?!\1).)+)\1[^>]+data-slug=["\']%s'
  82             % re.escape(video_id), webpage, 'kaltura entry id', group='id')
  83
  84         return self._kaltura_video(partner_id, entry_id)
  85
  86
  87 class AZMedienPlaylistIE(AZMedienBaseIE):
  88     IE_DESC = 'AZ Medien playlists'
  89     _VALID_URL = r'''(?x)
  90                     https?://
  91                         (?:www\.)?
  92                         (?:
  93                             telezueri\.ch|
  94                             telebaern\.tv|
  95                             telem1\.ch
  96                         )/
  97                         (?P<id>[0-9]+-
  98                             (?:
  99                                 show|
 100                                 topic|
 101                                 themen
 102                             )-[^/\#]+
 103                             (?:
 104                                 /[0-9]+-episode-[^/\#]+
 105                             )?
 106                         )$
 107                     '''
 108
 109     _TESTS = [{
 110         # URL with 'episode'
 111         'url': 'http://www.telebaern.tv/118-show-news/13735-episode-donnerstag-15-dezember-2016',
 112         'info_dict': {
 113             'id': '118-show-news/13735-episode-donnerstag-15-dezember-2016',
 114             'title': 'News - Donnerstag, 15. Dezember 2016',
 115         },
 116         'playlist_count': 9,
 117     }, {
 118         # URL with 'themen'
 119         'url': 'http://www.telem1.ch/258-themen-tele-m1-classics',
 120         'info_dict': {
 121             'id': '258-themen-tele-m1-classics',
 122             'title': 'Tele M1 Classics',
 123         },
 124         'playlist_mincount': 15,
 125     }, {
 126         # URL with 'topic', contains nested playlists
 127         'url': 'http://www.telezueri.ch/219-topic-aera-trump-hat-offiziell-begonnen',
 128         'only_matching': True,
 129     }, {
 130         # URL with 'show' only
 131         'url': 'http://www.telezueri.ch/86-show-talktaeglich',
 132         'only_matching': True
 133     }]
 134
 135     def _real_extract(self, url):
 136         show_id = self._match_id(url)
 137         webpage = self._download_webpage(url, show_id)
 138
 139         entries = []
 140
 141         partner_id = self._search_regex(
 142             r'src=["\'](?:https?:)?//(?:[^/]+\.)kaltura\.com/(?:[^/]+/)*(?:p|partner_id)/(\d+)',
 143             webpage, 'kaltura partner id', default=None)
 144
 145         if partner_id:
 146             entries = [
 147                 self._kaltura_video(partner_id, m.group('id'))
 148                 for m in re.finditer(
 149                     r'data-id=(["\'])(?P<id>(?:(?!\1).)+)\1', webpage)]
 150
 151         if not entries:
 152             entries = [
 153                 self.url_result(m.group('url'), ie=AZMedienIE.ie_key())
 154                 for m in re.finditer(
 155                     r'<a[^>]+data-real=(["\'])(?P<url>http.+?)\1', webpage)]
 156
 157         if not entries:
 158             entries = [
 159                 # May contain nested playlists (e.g. [1]) thus no explicit
 160                 # ie_key
 161                 # 1. http://www.telezueri.ch/219-topic-aera-trump-hat-offiziell-begonnen)
 162                 self.url_result(urljoin(url, m.group('url')))
 163                 for m in re.finditer(
 164                     r'<a[^>]+name=[^>]+href=(["\'])(?P<url>/.+?)\1', webpage)]
 165
 166         title = self._search_regex(
 167             r'episodeShareTitle\s*=\s*(["\'])(?P<title>(?:(?!\1).)+)\1',
 168             webpage, 'title',
 169             default=strip_or_none(get_element_by_id(
 170                 'video-title', webpage)), group='title')
 171
 172         return self.playlist_result(entries, show_id, title)