[youtube] Fix extraction.
[youtube-dl] / youtube_dl / extractor / joj.py
1 # coding: utf-8\r
2 from __future__ import unicode_literals\r
3 \r
4 import re\r
5 \r
6 from .common import InfoExtractor\r
7 from ..compat import compat_str\r
8 from ..utils import (\r
9     int_or_none,\r
10     js_to_json,\r
11     try_get,\r
12 )\r
13 \r
14 \r
class JojIE(InfoExtractor):
    """Extractor for videos served through the media.joj.sk embed player.

    Matches both ``https://media.joj.sk/embed/<id>`` URLs and the
    ``joj:<id>`` form.
    """

    _VALID_URL = r'''(?x)
                    (?:
                        joj:|
                        https?://media\.joj\.sk/embed/
                    )
                    (?P<id>[^/?#^]+)
                '''
    _TESTS = [{
        'url': 'https://media.joj.sk/embed/a388ec4c-6019-4a4a-9312-b1bee194e932',
        'info_dict': {
            'id': 'a388ec4c-6019-4a4a-9312-b1bee194e932',
            'ext': 'mp4',
            'title': 'NOVÉ BÝVANIE',
            'thumbnail': r're:^https?://.*\.jpg$',
            'duration': 3118,
        }
    }, {
        'url': 'https://media.joj.sk/embed/9i1cxv',
        'only_matching': True,
    }, {
        'url': 'joj:a388ec4c-6019-4a4a-9312-b1bee194e932',
        'only_matching': True,
    }, {
        'url': 'joj:9i1cxv',
        'only_matching': True,
    }]

    @staticmethod
    def _extract_urls(webpage):
        """Return the media.joj.sk embed URLs found in ``<iframe src=...>``
        attributes of *webpage*, in document order.

        Both absolute (``http(s)://``) and protocol-relative (``//``) URLs
        are matched; an empty list is returned when there is none.
        """
        return [
            mobj.group('url')
            for mobj in re.finditer(
                r'<iframe\b[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//media\.joj\.sk/embed/(?:(?!\1).)+)\1',
                webpage)]

    def _real_extract(self, url):
        """Build the info dict (id, title, thumbnail, duration, formats)
        for the video identified by *url*.

        Formats are read from the ``src``/``bitrates`` JavaScript object of
        the embed page; when that yields nothing, the XML playlist served by
        ``services/Video.php`` is used instead.
        """
        video_id = self._match_id(url)

        # Always fetch the canonical embed page, so that ``joj:<id>`` URLs
        # resolve the same way as full embed URLs.
        webpage = self._download_webpage(
            'https://media.joj.sk/embed/%s' % video_id, video_id)

        # Prefer the player's videoTitle, then <title>, then the og:title
        # meta tag.
        title = self._search_regex(
            (r'videoTitle\s*:\s*(["\'])(?P<title>(?:(?!\1).)+)\1',
             r'<title>(?P<title>[^<]+)'), webpage, 'title',
            default=None, group='title') or self._og_search_title(webpage)

        # Non-fatal: a missing or unparsable object just leads to the XML
        # fallback below.
        bitrates = self._parse_json(
            self._search_regex(
                r'(?s)(?:src|bitrates)\s*=\s*({.+?});', webpage, 'bitrates',
                default='{}'),
            video_id, transform_source=js_to_json, fatal=False)

        formats = []
        for format_url in try_get(bitrates, lambda x: x['mp4'], list) or []:
            if isinstance(format_url, compat_str):
                height = self._search_regex(
                    r'(\d+)[pP]\.', format_url, 'height', default=None)
                formats.append({
                    'url': format_url,
                    'format_id': '%sp' % height if height else None,
                    # height is None when the URL carries no "<digits>p."
                    # marker; int_or_none keeps it None instead of letting
                    # int(None) raise TypeError and abort the extraction.
                    'height': int_or_none(height),
                })
        if not formats:
            # Fallback: XML playlist listing the available files.
            playlist = self._download_xml(
                'https://media.joj.sk/services/Video.php?clip=%s' % video_id,
                video_id)
            for file_el in playlist.findall('./files/file'):
                path = file_el.get('path')
                if not path:
                    continue
                format_id = file_el.get('id') or file_el.get('label')
                formats.append({
                    # Only the first 'dat/' occurrence is dropped from the
                    # path before it is appended to the storage host URL.
                    'url': 'http://n16.joj.sk/storage/%s' % path.replace(
                        'dat/', '', 1),
                    'format_id': format_id,
                    'height': int_or_none(self._search_regex(
                        r'(\d+)[pP]', format_id or path, 'height',
                        default=None)),
                })
        self._sort_formats(formats)

        thumbnail = self._og_search_thumbnail(webpage)

        duration = int_or_none(self._search_regex(
            r'videoDuration\s*:\s*(\d+)', webpage, 'duration', fatal=False))

        return {
            'id': video_id,
            'title': title,
            'thumbnail': thumbnail,
            'duration': duration,
            'formats': formats,
        }