_ Git - youtube-dl/blob - youtube_dl/extractor/metacafe.py

   1 from __future__ import unicode_literals
   2
   3 import re
   4
   5 from .common import InfoExtractor
   6 from ..compat import (
   7     compat_parse_qs,
   8     compat_urllib_parse_unquote,
   9 )
  10 from ..utils import (
  11     determine_ext,
  12     ExtractorError,
  13     int_or_none,
  14     urlencode_postdata,
  15     get_element_by_attribute,
  16     mimetype2ext,
  17 )
  18
  19
  20 class MetacafeIE(InfoExtractor):
  21     _VALID_URL = r'https?://(?:www\.)?metacafe\.com/watch/(?P<video_id>[^/]+)/(?P<display_id>[^/?#]+)'
  22     _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
  23     _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
  24     IE_NAME = 'metacafe'
  25     _TESTS = [
  26         # Youtube video
  27         {
  28             'add_ie': ['Youtube'],
  29             'url': 'http://metacafe.com/watch/yt-_aUehQsCQtM/the_electric_company_short_i_pbs_kids_go/',
  30             'info_dict': {
  31                 'id': '_aUehQsCQtM',
  32                 'ext': 'mp4',
  33                 'upload_date': '20090102',
  34                 'title': 'The Electric Company | "Short I" | PBS KIDS GO!',
  35                 'description': 'md5:2439a8ef6d5a70e380c22f5ad323e5a8',
  36                 'uploader': 'PBS',
  37                 'uploader_id': 'PBS'
  38             }
  39         },
  40         # Normal metacafe video
  41         {
  42             'url': 'http://www.metacafe.com/watch/11121940/news_stuff_you_wont_do_with_your_playstation_4/',
  43             'md5': '6e0bca200eaad2552e6915ed6fd4d9ad',
  44             'info_dict': {
  45                 'id': '11121940',
  46                 'ext': 'mp4',
  47                 'title': 'News: Stuff You Won\'t Do with Your PlayStation 4',
  48                 'uploader': 'ign',
  49                 'description': 'Sony released a massive FAQ on the PlayStation Blog detailing the PS4\'s capabilities and limitations.',
  50             },
  51             'skip': 'Page is temporarily unavailable.',
  52         },
  53         # metacafe video with family filter
  54         {
  55             'url': 'http://www.metacafe.com/watch/2155630/adult_art_by_david_hart_156/',
  56             'md5': 'b06082c5079bbdcde677a6291fbdf376',
  57             'info_dict': {
  58                 'id': '2155630',
  59                 'ext': 'mp4',
  60                 'title': 'Adult Art By David Hart #156',
  61                 'uploader': 'hartistry',
  62                 'description': 'Adult Art By David Hart.  All the Art Works presented here are not in the possession of the American Artist, David John Hart.  The paintings are in collections worldwide of individuals, countries, art museums, foundations and charities.',
  63             }
  64         },
  65         # AnyClip video
  66         {
  67             'url': 'http://www.metacafe.com/watch/an-dVVXnuY7Jh77J/the_andromeda_strain_1971_stop_the_bomb_part_3/',
  68             'info_dict': {
  69                 'id': 'an-dVVXnuY7Jh77J',
  70                 'ext': 'mp4',
  71                 'title': 'The Andromeda Strain (1971): Stop the Bomb Part 3',
  72                 'uploader': 'AnyClip',
  73                 'description': 'md5:cbef0460d31e3807f6feb4e7a5952e5b',
  74             },
  75         },
  76         # age-restricted video
  77         {
  78             'url': 'http://www.metacafe.com/watch/5186653/bbc_internal_christmas_tape_79_uncensored_outtakes_etc/',
  79             'md5': '98dde7c1a35d02178e8ab7560fe8bd09',
  80             'info_dict': {
  81                 'id': '5186653',
  82                 'ext': 'mp4',
  83                 'title': 'BBC INTERNAL Christmas Tape \'79 - UNCENSORED Outtakes, Etc.',
  84                 'uploader': 'Dwayne Pipe',
  85                 'description': 'md5:950bf4c581e2c059911fa3ffbe377e4b',
  86                 'age_limit': 18,
  87             },
  88         },
  89         # cbs video
  90         {
  91             'url': 'http://www.metacafe.com/watch/cb-8VD4r_Zws8VP/open_this_is_face_the_nation_february_9/',
  92             'info_dict': {
  93                 'id': '8VD4r_Zws8VP',
  94                 'ext': 'flv',
  95                 'title': 'Open: This is Face the Nation, February 9',
  96                 'description': 'md5:8a9ceec26d1f7ed6eab610834cc1a476',
  97                 'duration': 96,
  98                 'uploader': 'CBSI-NEW',
  99                 'upload_date': '20140209',
 100                 'timestamp': 1391959800,
 101             },
 102             'params': {
 103                 # rtmp download
 104                 'skip_download': True,
 105             },
 106         },
 107         # Movieclips.com video
 108         {
 109             'url': 'http://www.metacafe.com/watch/mv-Wy7ZU/my_week_with_marilyn_do_you_love_me/',
 110             'info_dict': {
 111                 'id': 'mv-Wy7ZU',
 112                 'ext': 'mp4',
 113                 'title': 'My Week with Marilyn - Do You Love Me?',
 114                 'description': 'From the movie My Week with Marilyn - Colin (Eddie Redmayne) professes his love to Marilyn (Michelle Williams) and gets her to promise to return to set and finish the movie.',
 115                 'uploader': 'movie_trailers',
 116                 'duration': 176,
 117             },
 118             'params': {
 119                 'skip_download': 'requires rtmpdump',
 120             }
 121         }
 122     ]
 123
 124     def report_disclaimer(self):
 125         self.to_screen('Retrieving disclaimer')
 126
 127     def _confirm_age(self):
 128         # Retrieve disclaimer
 129         self.report_disclaimer()
 130         self._download_webpage(self._DISCLAIMER, None, False, 'Unable to retrieve disclaimer')
 131
 132         # Confirm age
 133         self.report_age_confirmation()
 134         self._download_webpage(
 135             self._FILTER_POST, None, False, 'Unable to confirm age',
 136             data=urlencode_postdata({
 137                 'filters': '0',
 138                 'submit': "Continue - I'm over 18",
 139             }), headers={
 140                 'Content-Type': 'application/x-www-form-urlencoded',
 141             })
 142
 143     def _real_extract(self, url):
 144         # Extract id and simplified title from URL
 145         video_id, display_id = re.match(self._VALID_URL, url).groups()
 146
 147         # the video may come from an external site
 148         m_external = re.match(r'^(\w{2})-(.*)$', video_id)
 149         if m_external is not None:
 150             prefix, ext_id = m_external.groups()
 151             # Check if video comes from YouTube
 152             if prefix == 'yt':
 153                 return self.url_result('http://www.youtube.com/watch?v=%s' % ext_id, 'Youtube')
 154             # CBS videos use theplatform.com
 155             if prefix == 'cb':
 156                 return self.url_result('theplatform:%s' % ext_id, 'ThePlatform')
 157
 158         # self._confirm_age()
 159
 160         # AnyClip videos require the flashversion cookie so that we get the link
 161         # to the mp4 file
 162         headers = {}
 163         headers['Cookie'] = 'user=%7B%22ffilter%22%3Afalse%7D;';
 164         if video_id.startswith('an-'):
 165             headers['Cookie'] += ' flashVersion=0;'
 166
 167         # Retrieve video webpage to extract further information
 168         webpage = self._download_webpage(url, video_id, headers=headers)
 169
 170         error = get_element_by_attribute(
 171             'class', 'notfound-page-title', webpage)
 172         if error:
 173             raise ExtractorError(error, expected=True)
 174
 175         video_title = self._html_search_meta(
 176             ['og:title', 'twitter:title'], webpage, 'title', default=None) or self._search_regex(r'<h1>(.*?)</h1>', webpage, 'title')
 177
 178         # Extract URL, uploader and title from webpage
 179         self.report_extraction(video_id)
 180         video_url = None
 181         mobj = re.search(r'(?m)&(?:media|video)URL=([^&]+)', webpage)
 182         if mobj is not None:
 183             mediaURL = compat_urllib_parse_unquote(mobj.group(1))
 184             video_ext = determine_ext(mediaURL)
 185
 186             # Extract gdaKey if available
 187             mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
 188             if mobj is None:
 189                 video_url = mediaURL
 190             else:
 191                 gdaKey = mobj.group(1)
 192                 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
 193         if video_url is None:
 194             mobj = re.search(r'<video src="([^"]+)"', webpage)
 195             if mobj:
 196                 video_url = mobj.group(1)
 197                 video_ext = 'mp4'
 198         if video_url is None:
 199             flashvars = self._search_regex(
 200                 r' name="flashvars" value="(.*?)"', webpage, 'flashvars',
 201                 default=None)
 202             if flashvars:
 203                 vardict = compat_parse_qs(flashvars)
 204                 if 'mediaData' not in vardict:
 205                     raise ExtractorError('Unable to extract media URL')
 206                 mobj = re.search(
 207                     r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
 208                 if mobj is None:
 209                     raise ExtractorError('Unable to extract media URL')
 210                 mediaURL = mobj.group('mediaURL').replace('\\/', '/')
 211                 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key'))
 212                 video_ext = determine_ext(video_url)
 213         if video_url is None:
 214             player_url = self._search_regex(
 215                 r"swfobject\.embedSWF\('([^']+)'",
 216                 webpage, 'config URL', default=None)
 217             if player_url:
 218                 config_url = self._search_regex(
 219                     r'config=(.+)$', player_url, 'config URL')
 220                 config_doc = self._download_xml(
 221                     config_url, video_id,
 222                     note='Downloading video config')
 223                 smil_url = config_doc.find('.//properties').attrib['smil_file']
 224                 smil_doc = self._download_xml(
 225                     smil_url, video_id,
 226                     note='Downloading SMIL document')
 227                 base_url = smil_doc.find('./head/meta').attrib['base']
 228                 video_url = []
 229                 for vn in smil_doc.findall('.//video'):
 230                     br = int(vn.attrib['system-bitrate'])
 231                     play_path = vn.attrib['src']
 232                     video_url.append({
 233                         'format_id': 'smil-%d' % br,
 234                         'url': base_url,
 235                         'play_path': play_path,
 236                         'page_url': url,
 237                         'player_url': player_url,
 238                         'ext': play_path.partition(':')[0],
 239                     })
 240         if video_url is None:
 241             flashvars = self._parse_json(self._search_regex(
 242                 r'flashvars\s*=\s*({.*});', webpage, 'flashvars',
 243                 default=None), video_id, fatal=False)
 244             if flashvars:
 245                 video_url = []
 246                 for source in flashvars.get('sources'):
 247                     source_url = source.get('src')
 248                     if not source_url:
 249                         continue
 250                     ext = mimetype2ext(source.get('type')) or determine_ext(source_url)
 251                     if ext == 'm3u8':
 252                         video_url.extend(self._extract_m3u8_formats(
 253                             source_url, video_id, 'mp4',
 254                             'm3u8_native', m3u8_id='hls', fatal=False))
 255                     else:
 256                         video_url.append({
 257                             'url': source_url,
 258                             'ext': ext,
 259                         })
 260
 261         if video_url is None:
 262             raise ExtractorError('Unsupported video type')
 263
 264         description = self._html_search_meta(
 265             ['og:description', 'twitter:description', 'description'],
 266             webpage, 'title', fatal=False)
 267         thumbnail = self._html_search_meta(
 268             ['og:image', 'twitter:image'], webpage, 'title', fatal=False)
 269         video_uploader = self._html_search_regex(
 270             r'submitter=(.*?);|googletag\.pubads\(\)\.setTargeting\("(?:channel|submiter)","([^"]+)"\);',
 271             webpage, 'uploader nickname', fatal=False)
 272         duration = int_or_none(
 273             self._html_search_meta('video:duration', webpage, default=None))
 274         age_limit = (
 275             18
 276             if re.search(r'(?:"contentRating":|"rating",)"restricted"', webpage)
 277             else 0)
 278
 279         if isinstance(video_url, list):
 280             formats = video_url
 281         else:
 282             formats = [{
 283                 'url': video_url,
 284                 'ext': video_ext,
 285             }]
 286         self._sort_formats(formats)
 287
 288         return {
 289             'id': video_id,
 290             'display_id': display_id,
 291             'description': description,
 292             'uploader': video_uploader,
 293             'title': video_title,
 294             'thumbnail': thumbnail,
 295             'age_limit': age_limit,
 296             'formats': formats,
 297             'duration': duration,
 298         }