_ Git - youtube-dl/blob - youtube_dl/extractor/appletrailers.py

   1 from __future__ import unicode_literals
   2
   3 import re
   4 import json
   5
   6 from .common import InfoExtractor
   7 from ..compat import compat_urlparse
   8 from ..utils import (
   9     int_or_none,
  10 )
  11
  12
  13 class AppleTrailersIE(InfoExtractor):
  14     IE_NAME = 'appletrailers'
  15     _VALID_URL = r'https?://(?:www\.|movie)?trailers\.apple\.com/(?:trailers|ca)/(?P<company>[^/]+)/(?P<movie>[^/]+)'
  16     _TESTS = [{
  17         'url': 'http://trailers.apple.com/trailers/wb/manofsteel/',
  18         'info_dict': {
  19             'id': 'manofsteel',
  20         },
  21         'playlist': [
  22             {
  23                 'md5': 'd97a8e575432dbcb81b7c3acb741f8a8',
  24                 'info_dict': {
  25                     'id': 'manofsteel-trailer4',
  26                     'ext': 'mov',
  27                     'duration': 111,
  28                     'title': 'Trailer 4',
  29                     'upload_date': '20130523',
  30                     'uploader_id': 'wb',
  31                 },
  32             },
  33             {
  34                 'md5': 'b8017b7131b721fb4e8d6f49e1df908c',
  35                 'info_dict': {
  36                     'id': 'manofsteel-trailer3',
  37                     'ext': 'mov',
  38                     'duration': 182,
  39                     'title': 'Trailer 3',
  40                     'upload_date': '20130417',
  41                     'uploader_id': 'wb',
  42                 },
  43             },
  44             {
  45                 'md5': 'd0f1e1150989b9924679b441f3404d48',
  46                 'info_dict': {
  47                     'id': 'manofsteel-trailer',
  48                     'ext': 'mov',
  49                     'duration': 148,
  50                     'title': 'Trailer',
  51                     'upload_date': '20121212',
  52                     'uploader_id': 'wb',
  53                 },
  54             },
  55             {
  56                 'md5': '5fe08795b943eb2e757fa95cb6def1cb',
  57                 'info_dict': {
  58                     'id': 'manofsteel-teaser',
  59                     'ext': 'mov',
  60                     'duration': 93,
  61                     'title': 'Teaser',
  62                     'upload_date': '20120721',
  63                     'uploader_id': 'wb',
  64                 },
  65             },
  66         ]
  67     }, {
  68         'url': 'http://trailers.apple.com/trailers/magnolia/blackthorn/',
  69         'info_dict': {
  70             'id': 'blackthorn',
  71         },
  72         'playlist_mincount': 2,
  73     }, {
  74         'url': 'http://trailers.apple.com/ca/metropole/autrui/',
  75         'only_matching': True,
  76     }, {
  77         'url': 'http://movietrailers.apple.com/trailers/focus_features/kuboandthetwostrings/',
  78         'only_matching': True,
  79     }]
  80
  81     _JSON_RE = r'iTunes.playURL\((.*?)\);'
  82
  83     def _real_extract(self, url):
  84         mobj = re.match(self._VALID_URL, url)
  85         movie = mobj.group('movie')
  86         uploader_id = mobj.group('company')
  87
  88         playlist_url = compat_urlparse.urljoin(url, 'includes/playlists/itunes.inc')
  89
  90         def fix_html(s):
  91             s = re.sub(r'(?s)<script[^<]*?>.*?</script>', '', s)
  92             s = re.sub(r'<img ([^<]*?)/?>', r'<img \1/>', s)
  93             # The ' in the onClick attributes are not escaped, it couldn't be parsed
  94             # like: http://trailers.apple.com/trailers/wb/gravity/
  95
  96             def _clean_json(m):
  97                 return 'iTunes.playURL(%s);' % m.group(1).replace('\'', '&#39;')
  98             s = re.sub(self._JSON_RE, _clean_json, s)
  99             s = '<html>%s</html>' % s
 100             return s
 101         doc = self._download_xml(playlist_url, movie, transform_source=fix_html)
 102
 103         playlist = []
 104         for li in doc.findall('./div/ul/li'):
 105             on_click = li.find('.//a').attrib['onClick']
 106             trailer_info_json = self._search_regex(self._JSON_RE,
 107                                                    on_click, 'trailer info')
 108             trailer_info = json.loads(trailer_info_json)
 109             first_url = trailer_info.get('url')
 110             if not first_url:
 111                 continue
 112             title = trailer_info['title']
 113             video_id = movie + '-' + re.sub(r'[^a-zA-Z0-9]', '', title).lower()
 114             thumbnail = li.find('.//img').attrib['src']
 115             upload_date = trailer_info['posted'].replace('-', '')
 116
 117             runtime = trailer_info['runtime']
 118             m = re.search(r'(?P<minutes>[0-9]+):(?P<seconds>[0-9]{1,2})', runtime)
 119             duration = None
 120             if m:
 121                 duration = 60 * int(m.group('minutes')) + int(m.group('seconds'))
 122
 123             trailer_id = first_url.split('/')[-1].rpartition('_')[0].lower()
 124             settings_json_url = compat_urlparse.urljoin(url, 'includes/settings/%s.json' % trailer_id)
 125             settings = self._download_json(settings_json_url, trailer_id, 'Downloading settings json')
 126
 127             formats = []
 128             for format in settings['metadata']['sizes']:
 129                 # The src is a file pointing to the real video file
 130                 format_url = re.sub(r'_(\d*p.mov)', r'_h\1', format['src'])
 131                 formats.append({
 132                     'url': format_url,
 133                     'format': format['type'],
 134                     'width': int_or_none(format['width']),
 135                     'height': int_or_none(format['height']),
 136                 })
 137
 138             self._sort_formats(formats)
 139
 140             playlist.append({
 141                 '_type': 'video',
 142                 'id': video_id,
 143                 'formats': formats,
 144                 'title': title,
 145                 'duration': duration,
 146                 'thumbnail': thumbnail,
 147                 'upload_date': upload_date,
 148                 'uploader_id': uploader_id,
 149                 'http_headers': {
 150                     'User-Agent': 'QuickTime compatible (youtube-dl)',
 151                 },
 152             })
 153
 154         return {
 155             '_type': 'playlist',
 156             'id': movie,
 157             'entries': playlist,
 158         }
 159
 160
 161 class AppleTrailersSectionIE(InfoExtractor):
 162     IE_NAME = 'appletrailers:section'
 163     _SECTIONS = {
 164         'justadded': {
 165             'feed_path': 'just_added',
 166             'title': 'Just Added',
 167         },
 168         'exclusive': {
 169             'feed_path': 'exclusive',
 170             'title': 'Exclusive',
 171         },
 172         'justhd': {
 173             'feed_path': 'just_hd',
 174             'title': 'Just HD',
 175         },
 176         'mostpopular': {
 177             'feed_path': 'most_pop',
 178             'title': 'Most Popular',
 179         },
 180         'moviestudios': {
 181             'feed_path': 'studios',
 182             'title': 'Movie Studios',
 183         },
 184     }
 185     _VALID_URL = r'https?://(?:www\.)?trailers\.apple\.com/#section=(?P<id>%s)' % '|'.join(_SECTIONS)
 186     _TESTS = [{
 187         'url': 'http://trailers.apple.com/#section=justadded',
 188         'info_dict': {
 189             'title': 'Just Added',
 190             'id': 'justadded',
 191         },
 192         'playlist_mincount': 80,
 193     }, {
 194         'url': 'http://trailers.apple.com/#section=exclusive',
 195         'info_dict': {
 196             'title': 'Exclusive',
 197             'id': 'exclusive',
 198         },
 199         'playlist_mincount': 80,
 200     }, {
 201         'url': 'http://trailers.apple.com/#section=justhd',
 202         'info_dict': {
 203             'title': 'Just HD',
 204             'id': 'justhd',
 205         },
 206         'playlist_mincount': 80,
 207     }, {
 208         'url': 'http://trailers.apple.com/#section=mostpopular',
 209         'info_dict': {
 210             'title': 'Most Popular',
 211             'id': 'mostpopular',
 212         },
 213         'playlist_mincount': 80,
 214     }, {
 215         'url': 'http://trailers.apple.com/#section=moviestudios',
 216         'info_dict': {
 217             'title': 'Movie Studios',
 218             'id': 'moviestudios',
 219         },
 220         'playlist_mincount': 80,
 221     }]
 222
 223     def _real_extract(self, url):
 224         section = self._match_id(url)
 225         section_data = self._download_json(
 226             'http://trailers.apple.com/trailers/home/feeds/%s.json' % self._SECTIONS[section]['feed_path'],
 227             section)
 228         entries = [
 229             self.url_result('http://trailers.apple.com' + e['location'])
 230             for e in section_data]
 231         return self.playlist_result(entries, section, self._SECTIONS[section]['title'])