_ Git - youtube-dl/blob - youtube_dl/extractor/appletrailers.py

   1 from __future__ import unicode_literals
   2
   3 import re
   4 import json
   5
   6 from .common import InfoExtractor
   7 from ..compat import compat_urlparse
   8 from ..utils import (
   9     int_or_none,
  10 )
  11
  12
  13 class AppleTrailersIE(InfoExtractor):
  14     IE_NAME = 'appletrailers'
  15     _VALID_URL = r'https?://(?:www\.)?trailers\.apple\.com/(?:trailers|ca)/(?P<company>[^/]+)/(?P<movie>[^/]+)'
  16     _TESTS = [{
  17         'url': 'http://trailers.apple.com/trailers/wb/manofsteel/',
  18         'info_dict': {
  19             'id': 'manofsteel',
  20         },
  21         'playlist': [
  22             {
  23                 'md5': 'd97a8e575432dbcb81b7c3acb741f8a8',
  24                 'info_dict': {
  25                     'id': 'manofsteel-trailer4',
  26                     'ext': 'mov',
  27                     'duration': 111,
  28                     'title': 'Trailer 4',
  29                     'upload_date': '20130523',
  30                     'uploader_id': 'wb',
  31                 },
  32             },
  33             {
  34                 'md5': 'b8017b7131b721fb4e8d6f49e1df908c',
  35                 'info_dict': {
  36                     'id': 'manofsteel-trailer3',
  37                     'ext': 'mov',
  38                     'duration': 182,
  39                     'title': 'Trailer 3',
  40                     'upload_date': '20130417',
  41                     'uploader_id': 'wb',
  42                 },
  43             },
  44             {
  45                 'md5': 'd0f1e1150989b9924679b441f3404d48',
  46                 'info_dict': {
  47                     'id': 'manofsteel-trailer',
  48                     'ext': 'mov',
  49                     'duration': 148,
  50                     'title': 'Trailer',
  51                     'upload_date': '20121212',
  52                     'uploader_id': 'wb',
  53                 },
  54             },
  55             {
  56                 'md5': '5fe08795b943eb2e757fa95cb6def1cb',
  57                 'info_dict': {
  58                     'id': 'manofsteel-teaser',
  59                     'ext': 'mov',
  60                     'duration': 93,
  61                     'title': 'Teaser',
  62                     'upload_date': '20120721',
  63                     'uploader_id': 'wb',
  64                 },
  65             },
  66         ]
  67     }, {
  68         'url': 'http://trailers.apple.com/trailers/magnolia/blackthorn/',
  69         'info_dict': {
  70             'id': 'blackthorn',
  71         },
  72         'playlist_mincount': 2,
  73     }, {
  74         'url': 'http://trailers.apple.com/ca/metropole/autrui/',
  75         'only_matching': True,
  76     }]
  77
  78     _JSON_RE = r'iTunes.playURL\((.*?)\);'
  79
  80     def _real_extract(self, url):
  81         mobj = re.match(self._VALID_URL, url)
  82         movie = mobj.group('movie')
  83         uploader_id = mobj.group('company')
  84
  85         playlist_url = compat_urlparse.urljoin(url, 'includes/playlists/itunes.inc')
  86
  87         def fix_html(s):
  88             s = re.sub(r'(?s)<script[^<]*?>.*?</script>', '', s)
  89             s = re.sub(r'<img ([^<]*?)/?>', r'<img \1/>', s)
  90             # The ' in the onClick attributes are not escaped, it couldn't be parsed
  91             # like: http://trailers.apple.com/trailers/wb/gravity/
  92
  93             def _clean_json(m):
  94                 return 'iTunes.playURL(%s);' % m.group(1).replace('\'', '&#39;')
  95             s = re.sub(self._JSON_RE, _clean_json, s)
  96             s = '<html>%s</html>' % s
  97             return s
  98         doc = self._download_xml(playlist_url, movie, transform_source=fix_html)
  99
 100         playlist = []
 101         for li in doc.findall('./div/ul/li'):
 102             on_click = li.find('.//a').attrib['onClick']
 103             trailer_info_json = self._search_regex(self._JSON_RE,
 104                                                    on_click, 'trailer info')
 105             trailer_info = json.loads(trailer_info_json)
 106             first_url = trailer_info.get('url')
 107             if not first_url:
 108                 continue
 109             title = trailer_info['title']
 110             video_id = movie + '-' + re.sub(r'[^a-zA-Z0-9]', '', title).lower()
 111             thumbnail = li.find('.//img').attrib['src']
 112             upload_date = trailer_info['posted'].replace('-', '')
 113
 114             runtime = trailer_info['runtime']
 115             m = re.search(r'(?P<minutes>[0-9]+):(?P<seconds>[0-9]{1,2})', runtime)
 116             duration = None
 117             if m:
 118                 duration = 60 * int(m.group('minutes')) + int(m.group('seconds'))
 119
 120             trailer_id = first_url.split('/')[-1].rpartition('_')[0].lower()
 121             settings_json_url = compat_urlparse.urljoin(url, 'includes/settings/%s.json' % trailer_id)
 122             settings = self._download_json(settings_json_url, trailer_id, 'Downloading settings json')
 123
 124             formats = []
 125             for format in settings['metadata']['sizes']:
 126                 # The src is a file pointing to the real video file
 127                 format_url = re.sub(r'_(\d*p.mov)', r'_h\1', format['src'])
 128                 formats.append({
 129                     'url': format_url,
 130                     'format': format['type'],
 131                     'width': int_or_none(format['width']),
 132                     'height': int_or_none(format['height']),
 133                 })
 134
 135             self._sort_formats(formats)
 136
 137             playlist.append({
 138                 '_type': 'video',
 139                 'id': video_id,
 140                 'formats': formats,
 141                 'title': title,
 142                 'duration': duration,
 143                 'thumbnail': thumbnail,
 144                 'upload_date': upload_date,
 145                 'uploader_id': uploader_id,
 146                 'http_headers': {
 147                     'User-Agent': 'QuickTime compatible (youtube-dl)',
 148                 },
 149             })
 150
 151         return {
 152             '_type': 'playlist',
 153             'id': movie,
 154             'entries': playlist,
 155         }
 156
 157
 158 class AppleTrailersSectionIE(InfoExtractor):
 159     IE_NAME = 'appletrailers:section'
 160     _SECTIONS = {
 161         'justadded': {
 162             'feed_path': 'just_added',
 163             'title': 'Just Added',
 164         },
 165         'exclusive': {
 166             'feed_path': 'exclusive',
 167             'title': 'Exclusive',
 168         },
 169         'justhd': {
 170             'feed_path': 'just_hd',
 171             'title': 'Just HD',
 172         },
 173         'mostpopular': {
 174             'feed_path': 'most_pop',
 175             'title': 'Most Popular',
 176         },
 177         'moviestudios': {
 178             'feed_path': 'studios',
 179             'title': 'Movie Studios',
 180         },
 181     }
 182     _VALID_URL = r'https?://(?:www\.)?trailers\.apple\.com/#section=(?P<id>%s)' % '|'.join(_SECTIONS)
 183     _TESTS = [{
 184         'url': 'http://trailers.apple.com/#section=justadded',
 185         'info_dict': {
 186             'title': 'Just Added',
 187             'id': 'justadded',
 188         },
 189         'playlist_mincount': 80,
 190     }, {
 191         'url': 'http://trailers.apple.com/#section=exclusive',
 192         'info_dict': {
 193             'title': 'Exclusive',
 194             'id': 'exclusive',
 195         },
 196         'playlist_mincount': 80,
 197     }, {
 198         'url': 'http://trailers.apple.com/#section=justhd',
 199         'info_dict': {
 200             'title': 'Just HD',
 201             'id': 'justhd',
 202         },
 203         'playlist_mincount': 80,
 204     }, {
 205         'url': 'http://trailers.apple.com/#section=mostpopular',
 206         'info_dict': {
 207             'title': 'Most Popular',
 208             'id': 'mostpopular',
 209         },
 210         'playlist_mincount': 80,
 211     }, {
 212         'url': 'http://trailers.apple.com/#section=moviestudios',
 213         'info_dict': {
 214             'title': 'Movie Studios',
 215             'id': 'moviestudios',
 216         },
 217         'playlist_mincount': 80,
 218     }]
 219
 220     def _real_extract(self, url):
 221         section = self._match_id(url)
 222         section_data = self._download_json(
 223             'http://trailers.apple.com/trailers/home/feeds/%s.json' % self._SECTIONS[section]['feed_path'],
 224             section)
 225         entries = [
 226             self.url_result('http://trailers.apple.com' + e['location'])
 227             for e in section_data]
 228         return self.playlist_result(entries, section, self._SECTIONS[section]['title'])