1 from __future__ import unicode_literals
5 from .common import InfoExtractor
8 compat_urllib_parse_unquote,
15 get_element_by_attribute,
20 class MetacafeIE(InfoExtractor):
# Matches Metacafe watch-page URLs (http or https, optional www.); the two
# path segments after /watch/ are captured as video_id and display_id.
21 _VALID_URL = r'https?://(?:www\.)?metacafe\.com/watch/(?P<video_id>[^/]+)/(?P<display_id>[^/?#]+)'
# Family-filter disclaimer page; _confirm_age downloads it first.
22 _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
# Endpoint that _confirm_age sends the age-confirmation form data to.
23 _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
# NOTE(review): these look like the extractor test-case entries (url, md5 and
# expected metadata fields). The opening of the table and several lines of
# each entry are not shown in this listing, so entry boundaries are inferred.
# Entry with a yt- prefixed id; presumably handed off to the Youtube extractor - confirm.
28 'add_ie': ['Youtube'],
29 'url': 'http://metacafe.com/watch/yt-_aUehQsCQtM/the_electric_company_short_i_pbs_kids_go/',
33 'upload_date': '20090102',
34 'title': 'The Electric Company | "Short I" | PBS KIDS GO!',
35 'description': 'md5:2439a8ef6d5a70e380c22f5ad323e5a8',
40 # Normal metacafe video
42 'url': 'http://www.metacafe.com/watch/11121940/news_stuff_you_wont_do_with_your_playstation_4/',
43 'md5': '6e0bca200eaad2552e6915ed6fd4d9ad',
47 'title': 'News: Stuff You Won\'t Do with Your PlayStation 4',
49 'description': 'Sony released a massive FAQ on the PlayStation Blog detailing the PS4\'s capabilities and limitations.',
# A skip reason is recorded here, so this case is presumably not run at present.
51 'skip': 'Page is temporarily unavailable.',
53 # metacafe video with family filter
55 'url': 'http://www.metacafe.com/watch/2155630/adult_art_by_david_hart_156/',
56 'md5': 'b06082c5079bbdcde677a6291fbdf376',
60 'title': 'Adult Art By David Hart #156',
61 'uploader': 'hartistry',
62 'description': 'Adult Art By David Hart. All the Art Works presented here are not in the possession of the American Artist, David John Hart. The paintings are in collections worldwide of individuals, countries, art museums, foundations and charities.',
# AnyClip video (an- prefixed id); _real_extract adds a flashVersion cookie for ids starting with an-.
67 'url': 'http://www.metacafe.com/watch/an-dVVXnuY7Jh77J/the_andromeda_strain_1971_stop_the_bomb_part_3/',
69 'id': 'an-dVVXnuY7Jh77J',
71 'title': 'The Andromeda Strain (1971): Stop the Bomb Part 3',
72 'uploader': 'AnyClip',
73 'description': 'md5:cbef0460d31e3807f6feb4e7a5952e5b',
76 # age-restricted video
78 'url': 'http://www.metacafe.com/watch/5186653/bbc_internal_christmas_tape_79_uncensored_outtakes_etc/',
79 'md5': '98dde7c1a35d02178e8ab7560fe8bd09',
83 'title': 'BBC INTERNAL Christmas Tape \'79 - UNCENSORED Outtakes, Etc.',
84 'uploader': 'Dwayne Pipe',
85 'description': 'md5:950bf4c581e2c059911fa3ffbe377e4b',
# cb- prefixed id; presumably the CBS case that _real_extract routes to ThePlatform - confirm.
91 'url': 'http://www.metacafe.com/watch/cb-8VD4r_Zws8VP/open_this_is_face_the_nation_february_9/',
95 'title': 'Open: This is Face the Nation, February 9',
96 'description': 'md5:8a9ceec26d1f7ed6eab610834cc1a476',
98 'uploader': 'CBSI-NEW',
99 'upload_date': '20140209',
100 'timestamp': 1391959800,
# Presumably a per-test option that skips the media download - confirm against the test harness.
104 'skip_download': True,
107 # Movieclips.com video
109 'url': 'http://www.metacafe.com/watch/mv-Wy7ZU/my_week_with_marilyn_do_you_love_me/',
113 'title': 'My Week with Marilyn - Do You Love Me?',
114 'description': 'From the movie My Week with Marilyn - Colin (Eddie Redmayne) professes his love to Marilyn (Michelle Williams) and gets her to promise to return to set and finish the movie.',
115 'uploader': 'movie_trailers',
# Same option as above, but the value is a string naming the reason (rtmpdump is needed).
119 'skip_download': 'requires rtmpdump',
def report_disclaimer(self):
    """Report via ``self.to_screen`` that the disclaimer is being retrieved."""
    status_line = 'Retrieving disclaimer'
    self.to_screen(status_line)
# Walk through the Metacafe family-filter gate: download the disclaimer page,
# then send the over-18 confirmation form to the filter endpoint.
# NOTE(review): the only call to this method visible in this listing is
# commented out inside _real_extract; confirm whether it is still used.
# NOTE(review): lines are missing from this listing between the visible ones
# (for example, the closing brackets of the final call are not shown).
127 def _confirm_age(self):
128 # Retrieve disclaimer
129 self.report_disclaimer()
# The arguments after the URL are None, False and the error text; presumably
# (video_id, note, errnote) of InfoExtractor._download_webpage - verify.
130 self._download_webpage(self._DISCLAIMER, None, False, 'Unable to retrieve disclaimer')
# Confirm age: announce the step, then send the url-encoded form fields to
# the _FILTER_POST endpoint.
133 self.report_age_confirmation()
134 self._download_webpage(
135 self._FILTER_POST, None, False, 'Unable to confirm age',
136 data=urlencode_postdata({
# Value of the submit field sent with the form.
138 'submit': "Continue - I'm over 18",
# Declares the request body as a url-encoded form.
140 'Content-Type': 'application/x-www-form-urlencoded',
143 def _real_extract(self, url):
144 # Extract id and simplified title from URL
145 video_id, display_id = re.match(self._VALID_URL, url).groups()
147 # the video may come from an external site
148 m_external = re.match(r'^(\w{2})-(.*)$', video_id)
149 if m_external is not None:
150 prefix, ext_id = m_external.groups()
151 # Check if video comes from YouTube
153 return self.url_result('http://www.youtube.com/watch?v=%s' % ext_id, 'Youtube')
154 # CBS videos use theplatform.com
156 return self.url_result('theplatform:%s' % ext_id, 'ThePlatform')
158 # self._confirm_age()
160 # AnyClip videos require the flashversion cookie so that we get the link
163 headers['Cookie'] = 'user=%7B%22ffilter%22%3Afalse%7D;';
164 if video_id.startswith('an-'):
165 headers['Cookie'] += ' flashVersion=0;'
167 # Retrieve video webpage to extract further information
168 webpage = self._download_webpage(url, video_id, headers=headers)
170 error = get_element_by_attribute(
171 'class', 'notfound-page-title', webpage)
173 raise ExtractorError(error, expected=True)
175 video_title = self._html_search_meta(
176 ['og:title', 'twitter:title'], webpage, 'title', default=None) or self._search_regex(r'<h1>(.*?)</h1>', webpage, 'title')
178 # Extract URL, uploader and title from webpage
179 self.report_extraction(video_id)
181 mobj = re.search(r'(?m)&(?:media|video)URL=([^&]+)', webpage)
183 mediaURL = compat_urllib_parse_unquote(mobj.group(1))
184 video_ext = determine_ext(mediaURL)
186 # Extract gdaKey if available
187 mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
191 gdaKey = mobj.group(1)
192 video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
193 if video_url is None:
194 mobj = re.search(r'<video src="([^"]+)"', webpage)
196 video_url = mobj.group(1)
198 if video_url is None:
199 flashvars = self._search_regex(
200 r' name="flashvars" value="(.*?)"', webpage, 'flashvars',
203 vardict = compat_parse_qs(flashvars)
204 if 'mediaData' not in vardict:
205 raise ExtractorError('Unable to extract media URL')
207 r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
209 raise ExtractorError('Unable to extract media URL')
210 mediaURL = mobj.group('mediaURL').replace('\\/', '/')
211 video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key'))
212 video_ext = determine_ext(video_url)
213 if video_url is None:
214 player_url = self._search_regex(
215 r"swfobject\.embedSWF\('([^']+)'",
216 webpage, 'config URL', default=None)
218 config_url = self._search_regex(
219 r'config=(.+)$', player_url, 'config URL')
220 config_doc = self._download_xml(
221 config_url, video_id,
222 note='Downloading video config')
223 smil_url = config_doc.find('.//properties').attrib['smil_file']
224 smil_doc = self._download_xml(
226 note='Downloading SMIL document')
227 base_url = smil_doc.find('./head/meta').attrib['base']
229 for vn in smil_doc.findall('.//video'):
230 br = int(vn.attrib['system-bitrate'])
231 play_path = vn.attrib['src']
233 'format_id': 'smil-%d' % br,
235 'play_path': play_path,
237 'player_url': player_url,
238 'ext': play_path.partition(':')[0],
240 if video_url is None:
241 flashvars = self._parse_json(self._search_regex(
242 r'flashvars\s*=\s*({.*});', webpage, 'flashvars',
243 default=None), video_id, fatal=False)
246 for source in flashvars.get('sources'):
247 source_url = source.get('src')
250 ext = mimetype2ext(source.get('type')) or determine_ext(source_url)
252 video_url.extend(self._extract_m3u8_formats(
253 source_url, video_id, 'mp4',
254 'm3u8_native', m3u8_id='hls', fatal=False))
261 if video_url is None:
262 raise ExtractorError('Unsupported video type')
264 description = self._html_search_meta(
265 ['og:description', 'twitter:description', 'description'],
266 webpage, 'title', fatal=False)
267 thumbnail = self._html_search_meta(
268 ['og:image', 'twitter:image'], webpage, 'title', fatal=False)
269 video_uploader = self._html_search_regex(
270 r'submitter=(.*?);|googletag\.pubads\(\)\.setTargeting\("(?:channel|submiter)","([^"]+)"\);',
271 webpage, 'uploader nickname', fatal=False)
272 duration = int_or_none(
273 self._html_search_meta('video:duration', webpage, default=None))
276 if re.search(r'(?:"contentRating":|"rating",)"restricted"', webpage)
279 if isinstance(video_url, list):
286 self._sort_formats(formats)
290 'display_id': display_id,
291 'description': description,
292 'uploader': video_uploader,
293 'title': video_title,
294 'thumbnail': thumbnail,
295 'age_limit': age_limit,
297 'duration': duration,