[mitele] Extract series metadata and make title more robust (Closes #9758)
[youtube-dl] / youtube_dl / extractor / mitele.py
1 # coding: utf-8
2 from __future__ import unicode_literals
3
4 import re
5
6 from .common import InfoExtractor
7 from ..compat import (
8     compat_urllib_parse_urlencode,
9     compat_urlparse,
10 )
11 from ..utils import (
12     get_element_by_attribute,
13     int_or_none,
14     remove_start,
15 )
16
17
class MiTeleIE(InfoExtractor):
    """Information extractor for mitele.es video pages.

    The video page references a player config JSON (``data-config``
    attribute), which in turn points to an "mmc" JSON document listing
    the media locations.  Each location is resolved through its token
    service into an f4m manifest URL from which the formats are read.
    Series, season and episode names are scraped from the page heading.
    """

    IE_DESC = 'mitele.es'
    _VALID_URL = r'https?://www\.mitele\.es/[^/]+/[^/]+/[^/]+/(?P<id>[^/]+)/'

    _TESTS = [{
        'url': 'http://www.mitele.es/programas-tv/diario-de/la-redaccion/programa-144/',
        # MD5 is unstable
        'info_dict': {
            'id': '0NF1jJnxS1Wu3pHrmvFyw2',
            'display_id': 'programa-144',
            'ext': 'flv',
            'title': 'Tor, la web invisible',
            'description': 'md5:3b6fce7eaa41b2d97358726378d9369f',
            'series': 'Diario de',
            'season': 'La redacción',
            'episode': 'Programa 144',
            # Raw string: '\.' is not a valid escape in a plain literal.
            'thumbnail': r're:(?i)^https?://.*\.jpg$',
            'duration': 2913,
        },
    }, {
        # no explicit title
        'url': 'http://www.mitele.es/programas-tv/cuarto-milenio/temporada-6/programa-226/',
        'info_dict': {
            'id': 'eLZSwoEd1S3pVyUm8lc6F',
            'display_id': 'programa-226',
            'ext': 'flv',
            'title': 'Cuarto Milenio - Temporada 6 - Programa 226',
            'description': 'md5:50daf9fadefa4e62d9fc866d0c015701',
            'series': 'Cuarto Milenio',
            'season': 'Temporada 6',
            'episode': 'Programa 226',
            'thumbnail': r're:(?i)^https?://.*\.jpg$',
            'duration': 7312,
        },
        'params': {
            'skip_download': True,
        },
    }]

    def _real_extract(self, url):
        """Extract formats and metadata for a single mitele.es video page.

        :param url: page URL matching ``_VALID_URL``; its last path
            component is used as the display id.
        :return: info dict with ``id``, ``display_id``, ``title``,
            ``description``, ``series``, ``season``, ``episode``,
            ``thumbnail``, ``duration`` and ``formats``.
        """
        display_id = self._match_id(url)

        webpage = self._download_webpage(url, display_id)

        # The config URL found in the page may be relative; resolve it
        # against the page URL before downloading.
        config_url = self._search_regex(
            r'data-config\s*=\s*"([^"]+)"', webpage, 'data config url')
        config_url = compat_urlparse.urljoin(url, config_url)

        config = self._download_json(
            config_url, display_id, 'Downloading config JSON')

        mmc = self._download_json(
            config['services']['mmc'], display_id, 'Downloading mmc JSON')

        formats = []
        for location in mmc['locations']:
            gat = self._proto_relative_url(location.get('gat'), 'http:')
            bas = location.get('bas')
            loc = location.get('loc')
            ogn = location.get('ogn')
            # Every field is needed to build the token request; skip
            # locations that lack any of them.
            if None in (gat, bas, loc, ogn):
                continue
            token_data = {
                'bas': bas,
                'icd': loc,
                'ogn': ogn,
                'sta': '0',
            }
            media = self._download_json(
                '%s/?%s' % (gat, compat_urllib_parse_urlencode(token_data)),
                display_id, 'Downloading %s JSON' % loc)
            file_ = media.get('file')
            if not file_:
                continue
            formats.extend(self._extract_f4m_formats(
                file_ + '&hdcore=3.2.0&plugin=aasp-3.2.0.77.18',
                display_id, f4m_id=loc))
        self._sort_formats(formats)

        # Preferred title: the explicit <strong> headline, when present.
        title = self._search_regex(
            r'class="Destacado-text"[^>]*>\s*<strong>([^<]+)</strong>',
            webpage, 'title', default=None)

        # Series / season / episode are the three consecutive <span>s
        # inside the <h1> of the same "Destacado-text" element.
        mobj = re.search(r'''(?sx)
                            class="Destacado-text"[^>]*>.*?<h1>\s*
                            <span>(?P<series>[^<]+)</span>\s*
                            <span>(?P<season>[^<]+)</span>\s*
                            <span>(?P<episode>[^<]+)</span>''', webpage)
        series, season, episode = mobj.groups() if mobj else [None] * 3

        if not title:
            if mobj:
                # No explicit headline: compose the title from the
                # series metadata.
                title = '%s - %s - %s' % (series, season, episode)
            else:
                # Last resort: the page <title> without its
                # 'Ver online ' prefix.
                title = remove_start(self._search_regex(
                    r'<title>([^<]+)</title>', webpage, 'title'), 'Ver online ')

        # Fall back to the display id when the page has no media id.
        video_id = self._search_regex(
            r'data-media-id\s*=\s*"([^"]+)"', webpage,
            'data media id', default=None) or display_id
        # 'poster' may be absent or null in the config; `or {}` covers
        # both, whereas a .get() default only covers the absent case.
        thumbnail = (config.get('poster') or {}).get('imageUrl')
        duration = int_or_none(mmc.get('duration'))

        return {
            'id': video_id,
            'display_id': display_id,
            'title': title,
            'description': get_element_by_attribute('class', 'text', webpage),
            'series': series,
            'season': season,
            'episode': episode,
            'thumbnail': thumbnail,
            'duration': duration,
            'formats': formats,
        }