_ Git - youtube-dl/blob - youtube_dl/extractor/appletrailers.py

   1 from __future__ import unicode_literals
   2
   3 import re
   4 import json
   5
   6 from .common import InfoExtractor
   7 from ..compat import compat_urlparse
   8 from ..utils import (
   9     int_or_none,
  10 )
  11
  12
  13 class AppleTrailersIE(InfoExtractor):
  14     IE_NAME = 'appletrailers'
  15     _VALID_URL = r'https?://(?:www\.)?trailers\.apple\.com/(?:trailers|ca)/(?P<company>[^/]+)/(?P<movie>[^/]+)'
  16     _TESTS = [{
  17         'url': 'http://trailers.apple.com/trailers/wb/manofsteel/',
  18         'info_dict': {
  19             'id': 'manofsteel',
  20         },
  21         'playlist': [
  22             {
  23                 'md5': 'd97a8e575432dbcb81b7c3acb741f8a8',
  24                 'info_dict': {
  25                     'id': 'manofsteel-trailer4',
  26                     'ext': 'mov',
  27                     'duration': 111,
  28                     'title': 'Trailer 4',
  29                     'upload_date': '20130523',
  30                     'uploader_id': 'wb',
  31                 },
  32             },
  33             {
  34                 'md5': 'b8017b7131b721fb4e8d6f49e1df908c',
  35                 'info_dict': {
  36                     'id': 'manofsteel-trailer3',
  37                     'ext': 'mov',
  38                     'duration': 182,
  39                     'title': 'Trailer 3',
  40                     'upload_date': '20130417',
  41                     'uploader_id': 'wb',
  42                 },
  43             },
  44             {
  45                 'md5': 'd0f1e1150989b9924679b441f3404d48',
  46                 'info_dict': {
  47                     'id': 'manofsteel-trailer',
  48                     'ext': 'mov',
  49                     'duration': 148,
  50                     'title': 'Trailer',
  51                     'upload_date': '20121212',
  52                     'uploader_id': 'wb',
  53                 },
  54             },
  55             {
  56                 'md5': '5fe08795b943eb2e757fa95cb6def1cb',
  57                 'info_dict': {
  58                     'id': 'manofsteel-teaser',
  59                     'ext': 'mov',
  60                     'duration': 93,
  61                     'title': 'Teaser',
  62                     'upload_date': '20120721',
  63                     'uploader_id': 'wb',
  64                 },
  65             },
  66         ]
  67     }, {
  68         'url': 'http://trailers.apple.com/ca/metropole/autrui/',
  69         'only_matching': True,
  70     }]
  71
  72     _JSON_RE = r'iTunes.playURL\((.*?)\);'
  73
  74     def _real_extract(self, url):
  75         mobj = re.match(self._VALID_URL, url)
  76         movie = mobj.group('movie')
  77         uploader_id = mobj.group('company')
  78
  79         playlist_url = compat_urlparse.urljoin(url, 'includes/playlists/itunes.inc')
  80
  81         def fix_html(s):
  82             s = re.sub(r'(?s)<script[^<]*?>.*?</script>', '', s)
  83             s = re.sub(r'<img ([^<]*?)/?>', r'<img \1/>', s)
  84             # The ' in the onClick attributes are not escaped, it couldn't be parsed
  85             # like: http://trailers.apple.com/trailers/wb/gravity/
  86
  87             def _clean_json(m):
  88                 return 'iTunes.playURL(%s);' % m.group(1).replace('\'', '&#39;')
  89             s = re.sub(self._JSON_RE, _clean_json, s)
  90             s = '<html>%s</html>' % s
  91             return s
  92         doc = self._download_xml(playlist_url, movie, transform_source=fix_html)
  93
  94         playlist = []
  95         for li in doc.findall('./div/ul/li'):
  96             on_click = li.find('.//a').attrib['onClick']
  97             trailer_info_json = self._search_regex(self._JSON_RE,
  98                                                    on_click, 'trailer info')
  99             trailer_info = json.loads(trailer_info_json)
 100             title = trailer_info['title']
 101             video_id = movie + '-' + re.sub(r'[^a-zA-Z0-9]', '', title).lower()
 102             thumbnail = li.find('.//img').attrib['src']
 103             upload_date = trailer_info['posted'].replace('-', '')
 104
 105             runtime = trailer_info['runtime']
 106             m = re.search(r'(?P<minutes>[0-9]+):(?P<seconds>[0-9]{1,2})', runtime)
 107             duration = None
 108             if m:
 109                 duration = 60 * int(m.group('minutes')) + int(m.group('seconds'))
 110
 111             first_url = trailer_info['url']
 112             trailer_id = first_url.split('/')[-1].rpartition('_')[0].lower()
 113             settings_json_url = compat_urlparse.urljoin(url, 'includes/settings/%s.json' % trailer_id)
 114             settings = self._download_json(settings_json_url, trailer_id, 'Downloading settings json')
 115
 116             formats = []
 117             for format in settings['metadata']['sizes']:
 118                 # The src is a file pointing to the real video file
 119                 format_url = re.sub(r'_(\d*p.mov)', r'_h\1', format['src'])
 120                 formats.append({
 121                     'url': format_url,
 122                     'format': format['type'],
 123                     'width': int_or_none(format['width']),
 124                     'height': int_or_none(format['height']),
 125                 })
 126
 127             self._sort_formats(formats)
 128
 129             playlist.append({
 130                 '_type': 'video',
 131                 'id': video_id,
 132                 'formats': formats,
 133                 'title': title,
 134                 'duration': duration,
 135                 'thumbnail': thumbnail,
 136                 'upload_date': upload_date,
 137                 'uploader_id': uploader_id,
 138                 'http_headers': {
 139                     'User-Agent': 'QuickTime compatible (youtube-dl)',
 140                 },
 141             })
 142
 143         return {
 144             '_type': 'playlist',
 145             'id': movie,
 146             'entries': playlist,
 147         }
 148
 149
 150 class AppleTrailersSectionIE(InfoExtractor):
 151     IE_NAME = 'appletrailers:section'
 152     _SECTIONS = {
 153         'justadded': {
 154             'feed_path': 'just_added',
 155             'title': 'Just Added',
 156         },
 157         'exclusive': {
 158             'feed_path': 'exclusive',
 159             'title': 'Exclusive',
 160         },
 161         'justhd': {
 162             'feed_path': 'just_hd',
 163             'title': 'Just HD',
 164         },
 165         'mostpopular': {
 166             'feed_path': 'most_pop',
 167             'title': 'Most Popular',
 168         },
 169         'moviestudios': {
 170             'feed_path': 'studios',
 171             'title': 'Movie Studios',
 172         },
 173     }
 174     _VALID_URL = r'https?://(?:www\.)?trailers\.apple\.com/#section=(?P<id>%s)' % '|'.join(_SECTIONS)
 175     _TESTS = [{
 176         'url': 'http://trailers.apple.com/#section=justadded',
 177         'info_dict': {
 178             'title': 'Just Added',
 179             'id': 'justadded',
 180         },
 181         'playlist_mincount': 80,
 182     }, {
 183         'url': 'http://trailers.apple.com/#section=exclusive',
 184         'info_dict': {
 185             'title': 'Exclusive',
 186             'id': 'exclusive',
 187         },
 188         'playlist_mincount': 80,
 189     }, {
 190         'url': 'http://trailers.apple.com/#section=justhd',
 191         'info_dict': {
 192             'title': 'Just HD',
 193             'id': 'justhd',
 194         },
 195         'playlist_mincount': 80,
 196     }, {
 197         'url': 'http://trailers.apple.com/#section=mostpopular',
 198         'info_dict': {
 199             'title': 'Most Popular',
 200             'id': 'mostpopular',
 201         },
 202         'playlist_mincount': 80,
 203     }, {
 204         'url': 'http://trailers.apple.com/#section=moviestudios',
 205         'info_dict': {
 206             'title': 'Movie Studios',
 207             'id': 'moviestudios',
 208         },
 209         'playlist_mincount': 80,
 210     }]
 211
 212     def _real_extract(self, url):
 213         section = self._match_id(url)
 214         section_data = self._download_json(
 215             'http://trailers.apple.com/trailers/home/feeds/%s.json' % self._SECTIONS[section]['feed_path'],
 216             section)
 217         entries = [
 218             self.url_result('http://trailers.apple.com' + e['location'])
 219             for e in section_data]
 220         return self.playlist_result(entries, section, self._SECTIONS[section]['title'])