Merge pull request #1956 from dstftw/master
[youtube-dl] / youtube_dl / extractor / appletrailers.py
1 import re
2 import json
3
4 from .common import InfoExtractor
5 from ..utils import (
6     compat_urlparse,
7     determine_ext,
8 )
9
10
class AppleTrailersIE(InfoExtractor):
    """Extractor for movie pages on trailers.apple.com.

    A movie page lists several trailers; the extractor returns a playlist
    with one video entry per trailer, each carrying the formats listed in
    that trailer's settings JSON.
    """

    _VALID_URL = r'https?://(?:www\.)?trailers\.apple\.com/trailers/(?P<company>[^/]+)/(?P<movie>[^/]+)'
    _TEST = {
        u"url": u"http://trailers.apple.com/trailers/wb/manofsteel/",
        u"playlist": [
            {
                u"file": u"manofsteel-trailer4.mov",
                u"md5": u"d97a8e575432dbcb81b7c3acb741f8a8",
                u"info_dict": {
                    u"duration": 111,
                    u"title": u"Trailer 4",
                    u"upload_date": u"20130523",
                    u"uploader_id": u"wb",
                },
            },
            {
                u"file": u"manofsteel-trailer3.mov",
                u"md5": u"b8017b7131b721fb4e8d6f49e1df908c",
                u"info_dict": {
                    u"duration": 182,
                    u"title": u"Trailer 3",
                    u"upload_date": u"20130417",
                    u"uploader_id": u"wb",
                },
            },
            {
                u"file": u"manofsteel-trailer.mov",
                u"md5": u"d0f1e1150989b9924679b441f3404d48",
                u"info_dict": {
                    u"duration": 148,
                    u"title": u"Trailer",
                    u"upload_date": u"20121212",
                    u"uploader_id": u"wb",
                },
            },
            {
                u"file": u"manofsteel-teaser.mov",
                u"md5": u"5fe08795b943eb2e757fa95cb6def1cb",
                u"info_dict": {
                    u"duration": 93,
                    u"title": u"Teaser",
                    u"upload_date": u"20120721",
                    u"uploader_id": u"wb",
                },
            }
        ]
    }

    # Captures the argument of the iTunes.playURL(...) call found in the
    # onClick attribute of each trailer link; the argument is a JSON object.
    _JSON_RE = r'iTunes.playURL\((.*?)\);'

    def _real_extract(self, url):
        """Build a playlist with one entry per trailer of the movie at *url*.

        The company and movie slugs come from the URL itself.  The trailer
        list is read from ``includes/playlists/itunes.inc`` (resolved
        relative to *url*), and for every trailer an additional
        ``includes/settings/<trailer_id>.json`` document is downloaded to
        obtain the available formats.

        Returns a dict with ``_type`` ``'playlist'``, the movie slug as
        ``id`` and the list of video dicts as ``entries``.
        """
        mobj = re.match(self._VALID_URL, url)
        movie = mobj.group('movie')
        uploader_id = mobj.group('company')

        playlist_url = compat_urlparse.urljoin(url, u'includes/playlists/itunes.inc')

        def fix_html(s):
            """Massage the playlist fragment before _download_xml parses it."""
            # Drop <script> elements entirely and self-close <img> tags.
            s = re.sub(r'(?s)<script[^<]*?>.*?</script>', u'', s)
            s = re.sub(r'<img ([^<]*?)>', r'<img \1/>', s)

            # Single quotes inside the onClick attributes are not escaped,
            # which makes the document unparseable (for example on
            # http://trailers.apple.com/trailers/wb/gravity/), so replace
            # them with a character reference.
            def _clean_json(m):
                return u'iTunes.playURL(%s);' % m.group(1).replace('\'', '&#39;')
            s = re.sub(self._JSON_RE, _clean_json, s)
            # Wrap the fragment in a single root element.
            s = u'<html>' + s + u'</html>'
            return s
        doc = self._download_xml(playlist_url, movie, transform_source=fix_html)

        playlist = []
        for li in doc.findall('./div/ul/li'):
            on_click = li.find('.//a').attrib['onClick']
            trailer_info_json = self._search_regex(self._JSON_RE,
                on_click, u'trailer info')
            trailer_info = json.loads(trailer_info_json)
            title = trailer_info['title']
            # e.g. movie "manofsteel" + title "Trailer 4" -> "manofsteel-trailer4"
            video_id = movie + '-' + re.sub(r'[^a-zA-Z0-9]', '', title).lower()
            thumbnail = li.find('.//img').attrib['src']
            upload_date = trailer_info['posted'].replace('-', '')

            # The runtime is expected to look like "M:SS"; the duration stays
            # None when it does not.
            runtime = trailer_info['runtime']
            m = re.search(r'(?P<minutes>[0-9]+):(?P<seconds>[0-9]{1,2})', runtime)
            duration = None
            if m:
                duration = 60 * int(m.group('minutes')) + int(m.group('seconds'))

            # The settings file is named after the trailer's file name up to
            # its last underscore, lower-cased.
            first_url = trailer_info['url']
            trailer_id = first_url.split('/')[-1].rpartition('_')[0].lower()
            settings_json_url = compat_urlparse.urljoin(url, 'includes/settings/%s.json' % trailer_id)
            settings_json = self._download_webpage(settings_json_url, trailer_id, u'Downloading settings json')
            settings = json.loads(settings_json)

            formats = []
            for size in settings['metadata']['sizes']:
                # The src is a file pointing to the real video file
                format_url = re.sub(r'_(\d*p.mov)', r'_h\1', size['src'])
                formats.append({
                    'url': format_url,
                    'ext': determine_ext(format_url),
                    'format': size['type'],
                    # Both dimensions are converted to int so that the sort
                    # below compares numbers rather than strings.
                    'width': int(size['width']),
                    'height': int(size['height']),
                })
            # Order formats by ascending height, then ascending width.
            formats.sort(key=lambda f: (f['height'], f['width']))

            playlist.append({
                '_type': 'video',
                'id': video_id,
                'title': title,
                'formats': formats,
                'duration': duration,
                'thumbnail': thumbnail,
                'upload_date': upload_date,
                'uploader_id': uploader_id,
                'user_agent': 'QuickTime compatible (youtube-dl)',
            })

        return {
            '_type': 'playlist',
            'id': movie,
            'entries': playlist,
        }