2 # -*- coding: utf-8 -*-
4 from __future__ import absolute_import
15 import xml.etree.ElementTree
26 from .extractor.common import InfoExtractor, SearchInfoExtractor
27 from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeUserIE, YoutubeChannelIE
class MetacafeIE(InfoExtractor):
    """Information Extractor for metacafe.com."""

    # NOTE(review): the listing this was reviewed from omits some source lines
    # (``try:``, ``if mobj is None:``, dict delimiters ...).  They are restored
    # from context; values that had to be assumed are flagged individually.

    _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*'
    _DISCLAIMER = 'http://www.metacafe.com/family_filter/'
    _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
    # NOTE(review): not visible in the listing; presumed from the sibling
    # extractors' naming -- confirm.
    IE_NAME = u'metacafe'

    def report_disclaimer(self):
        """Report disclaimer retrieval."""
        self.to_screen(u'Retrieving disclaimer')

    def _real_initialize(self):
        """Fetch the family-filter disclaimer, then POST the age confirmation.

        Raises ExtractorError if either request fails.
        """
        network_errors = (compat_urllib_error.URLError,
                          compat_http_client.HTTPException,
                          socket.error)

        # Retrieve disclaimer (the body is read and discarded)
        request = compat_urllib_request.Request(self._DISCLAIMER)
        try:
            self.report_disclaimer()
            compat_urllib_request.urlopen(request).read()
        except network_errors as err:
            raise ExtractorError(u'Unable to retrieve disclaimer: %s' % compat_str(err))

        # Confirm age
        disclaimer_form = {
            # NOTE(review): only 'submit' is visible in the listing; the
            # 'filters' entry is assumed -- confirm.
            'filters': '0',
            'submit': "Continue - I'm over 18",
        }
        # urlencode() output is plain ASCII; Python 3 only accepts bytes as
        # POST data, and encoding is a no-op for an ASCII str on Python 2.
        form_data = compat_urllib_parse.urlencode(disclaimer_form).encode('ascii')
        request = compat_urllib_request.Request(self._FILTER_POST, form_data)
        try:
            self.report_age_confirmation()
            compat_urllib_request.urlopen(request).read()
        except network_errors as err:
            raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))

    def _real_extract(self, url):
        """Extract media URL, title and uploader for a Metacafe watch URL.

        Returns a one-element list holding either a url_result that points at
        YouTube (ids prefixed with "yt-") or an info dict.
        Raises ExtractorError when the URL or the page cannot be parsed.
        """
        # Extract id and simplified title from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Check if video comes from YouTube
        mobj2 = re.match(r'^yt-(.*)$', video_id)
        if mobj2 is not None:
            return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1), 'Youtube')]

        # Retrieve video webpage to extract further information
        webpage = self._download_webpage('http://www.metacafe.com/watch/%s/' % video_id, video_id)

        # Extract URL, uploader and title from webpage
        self.report_extraction(video_id)
        mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
        if mobj is not None:
            mediaURL = compat_urllib_parse.unquote(mobj.group(1))
            video_extension = mediaURL[-3:]

            # Extract gdaKey if available
            mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
            if mobj is None:
                video_url = mediaURL
            else:
                gdaKey = mobj.group(1)
                video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
        else:
            # Fall back to the JSON blob embedded in the flashvars attribute
            mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
            if mobj is None:
                raise ExtractorError(u'Unable to extract media URL')
            vardict = compat_parse_qs(mobj.group(1))
            if 'mediaData' not in vardict:
                raise ExtractorError(u'Unable to extract media URL')
            mobj = re.search(r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
            if mobj is None:
                raise ExtractorError(u'Unable to extract media URL')
            mediaURL = mobj.group('mediaURL').replace('\\/', '/')
            video_extension = mediaURL[-3:]
            video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key'))

        mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        video_title = mobj.group(1)

        mobj = re.search(r'submitter=(.*?);', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract uploader nickname')
        video_uploader = mobj.group(1)

        # Every value below is sliced out of `url`/`webpage` with text
        # patterns, so it is already text; calling .decode('utf-8') on it
        # fails on Python 3 and on non-ASCII data under Python 2.
        return [{
            'id': video_id,
            'url': video_url,
            'uploader': video_uploader,
            # NOTE(review): entry not visible in the listing; assumed None.
            'upload_date': None,
            'title': video_title,
            'ext': video_extension,
        }]
class DailymotionIE(InfoExtractor):
    """Information Extractor for Dailymotion"""

    # NOTE(review): the listing this was reviewed from omits some source lines
    # (``if mobj is None:``, ``break``/``else:``, the first dict entries).
    # They are restored from context -- confirm against the real file.

    _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
    IE_NAME = u'dailymotion'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for a Dailymotion URL.

        Raises ExtractorError when the URL, the flashvars blob, a known
        quality key or the title cannot be found.
        """
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Keep the part before the first '_' and drop any query string
        slug = url_match.group(1)
        video_id = slug.split('_')[0].split('?')[0]
        video_extension = 'mp4'

        # Retrieve video webpage with the family filter switched off
        request = compat_urllib_request.Request(url)
        request.add_header('Cookie', 'family_filter=off')
        webpage = self._download_webpage(request, video_id)

        self.report_extraction(video_id)
        flashvars_match = re.search(r'\s*var flashvars = (.*)', webpage)
        if flashvars_match is None:
            raise ExtractorError(u'Unable to extract media URL')
        flashvars = compat_urllib_parse.unquote(flashvars_match.group(1))

        # Keys are listed best quality first; take the first one present
        quality_keys = ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']
        present = [key for key in quality_keys if key in flashvars]
        if not present:
            raise ExtractorError(u'Unable to extract video URL')
        max_quality = present[0]
        self.to_screen(u'Using %s' % max_quality)

        url_match = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars)
        if url_match is None:
            raise ExtractorError(u'Unable to extract video URL')
        video_url = compat_urllib_parse.unquote(url_match.group(1)).replace('\\/', '/')

        # TODO: support choosing qualities

        title_match = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
        if title_match is None:
            raise ExtractorError(u'Unable to extract title')
        video_title = unescapeHTML(title_match.group('title'))

        uploader_patterns = [
            r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>',
            # Looking for official user
            r'<(?:span|a) .*?rel="author".*?>([^<]+?)</',
        ]
        video_uploader = self._search_regex(uploader_patterns, webpage, 'video uploader')

        # Page shows DD-MM-YYYY; regroup as YYYYMMDD
        date_match = re.search(r'<div class="[^"]*uploaded_cont[^"]*" title="[^"]*">([0-9]{2})-([0-9]{2})-([0-9]{4})</div>', webpage)
        if date_match is None:
            video_upload_date = None
        else:
            video_upload_date = date_match.group(3) + date_match.group(2) + date_match.group(1)

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': video_uploader,
            'upload_date': video_upload_date,
            'title': video_title,
            'ext': video_extension,
        }
        return [info]
class PhotobucketIE(InfoExtractor):
    """Information extractor for photobucket.com."""

    # NOTE(review): the listing this was reviewed from omits some source lines
    # (``if mobj is None:``, ``return [{`` ...).  They are restored from
    # context; assumed values are flagged individually.

    # TODO: the original _VALID_URL was:
    # r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)'
    # Check if it's necessary to keep the old extraction process
    _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*(([\?\&]current=)|_)(?P<id>.*)\.(?P<ext>(flv)|(mp4))'
    IE_NAME = u'photobucket'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for a Photobucket URL.

        The JSON object embedded in the page's JavaScript is preferred; when
        it is absent the <link>/<title> tags are scraped instead.
        Raises ExtractorError if the URL or the fallback title cannot be parsed.
        """
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        video_extension = mobj.group('ext')

        # Retrieve video webpage to extract further information
        webpage = self._download_webpage(url, video_id)

        # Extract URL, uploader, and title from webpage
        self.report_extraction(video_id)
        # We try first by looking the javascript code:
        mobj = re.search(r'Pb\.Data\.Shared\.put\(Pb\.Data\.Shared\.MEDIA, (?P<json>.*?)\);', webpage)
        if mobj is not None:
            info = json.loads(mobj.group('json'))
            return [{
                'id': video_id,
                'url': info[u'downloadUrl'],
                'uploader': info[u'username'],
                'upload_date': datetime.date.fromtimestamp(info[u'creationDate']).strftime('%Y%m%d'),
                'title': info[u'title'],
                'ext': video_extension,
                'thumbnail': info[u'thumbUrl'],
            }]

        # We try looking in other parts of the webpage
        video_url = self._search_regex(r'<link rel="video_src" href=".*\?file=([^"]+)" />',
                                       webpage, u'video URL')

        mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract title')
        # These groups are already text (the JSON branch above uses the same
        # values undecoded); .decode('utf-8') on them fails on Python 3 and
        # on non-ASCII data under Python 2.
        video_title = mobj.group(1)
        video_uploader = mobj.group(2)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': video_uploader,
            # NOTE(review): entry not visible in the listing; assumed None.
            'upload_date': None,
            'title': video_title,
            'ext': video_extension,
        }]
class YahooIE(InfoExtractor):
    """Information extractor for screen.yahoo.com."""

    # NOTE(review): the listing this was reviewed from omits some source lines
    # (``if m_id is None:``, ``meta = ...``, parts of the result dict).  They
    # are restored from context; assumed values are flagged individually.

    _VALID_URL = r'http://screen\.yahoo\.com/.*?-(?P<id>\d*?)\.html'

    def _real_extract(self, url):
        """Return the info dict for a screen.yahoo.com video page.

        Two lookups exist: the MRSS REST endpoints keyed by the numeric id
        from the URL, or, when the page declares a CONTENT_ID, a YQL query
        keyed by that id.
        Raises ExtractorError when a required piece cannot be found.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)
        m_id = re.search(r'YUI\.namespace\("Media"\)\.CONTENT_ID = "(?P<new_id>.+?)";', webpage)

        if m_id is None:
            # TODO: Check which url parameters are required
            info_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;lmsoverride=1;outputformat=mrss;cb=974419660;id=%s;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
            webpage = self._download_webpage(info_url, video_id, u'Downloading info webpage')
            # Compiled with re.VERBOSE, so layout whitespace in the pattern
            # is ignored and literal spaces are written as '\ '.
            info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]></title>.*
                        <description><!\[CDATA\[(?P<description>.*?)\]\]></description>.*
                        <media:pubStart><!\[CDATA\[(?P<date>.*?)\ .*\]\]></media:pubStart>.*
                        <media:content\ medium="image"\ url="(?P<thumb>.*?)"\ name="LARGETHUMB"
                        '''
            self.report_extraction(video_id)
            m_info = re.search(info_re, webpage, re.VERBOSE|re.DOTALL)
            if m_info is None:
                raise ExtractorError(u'Unable to extract video info')
            video_title = m_info.group('title')
            video_description = m_info.group('description')
            video_thumb = m_info.group('thumb')
            video_date = m_info.group('date')
            video_date = datetime.datetime.strptime(video_date, '%m/%d/%Y').strftime('%Y%m%d')

            # TODO: Find a way to get mp4 videos
            rest_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;element=stream;outputformat=mrss;id=%s;lmsoverride=1;bw=375;dynamicstream=1;cb=83521105;tech=flv,mp4;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
            webpage = self._download_webpage(rest_url, video_id, u'Downloading video url webpage')
            m_rest = re.search(r'<media:content url="(?P<url>.*?)" path="(?P<path>.*?)"', webpage)
            # The match must be checked before it is dereferenced; checking
            # afterwards turned a missing tag into an AttributeError.
            if m_rest is None:
                raise ExtractorError(u'Unable to extract video url')
            video_url = m_rest.group('url')
            video_path = m_rest.group('path')

        else: # We have to use a different method if another id is defined
            long_id = m_id.group('new_id')
            info_url = 'http://video.query.yahoo.com/v1/public/yql?q=SELECT%20*%20FROM%20yahoo.media.video.streams%20WHERE%20id%3D%22' + long_id + '%22%20AND%20format%3D%22mp4%2Cflv%22%20AND%20protocol%3D%22rtmp%2Chttp%22%20AND%20plrs%3D%2286Gj0vCaSzV_Iuf6hNylf2%22%20AND%20acctid%3D%22389%22%20AND%20plidl%3D%22%22%20AND%20pspid%3D%22792700001%22%20AND%20offnetwork%3D%22false%22%20AND%20site%3D%22ivy%22%20AND%20lang%3D%22en-US%22%20AND%20region%3D%22US%22%20AND%20override%3D%22none%22%3B&env=prod&format=json&callback=YUI.Env.JSONP.yui_3_8_1_1_1368368376830_335'
            webpage = self._download_webpage(info_url, video_id, u'Downloading info json')
            # Strip the JSONP callback wrapper to get at the JSON payload
            m_json = re.search(r'YUI.Env.JSONP.yui.*?\((.*?)\);', webpage)
            if m_json is None:
                raise ExtractorError(u'Unable to extract video info')
            info = json.loads(m_json.group(1))
            res = info[u'query'][u'results'][u'mediaObj'][0]
            stream = res[u'streams'][0]
            video_path = stream[u'path']
            video_url = stream[u'host']
            # NOTE(review): the assignment of `meta` is not visible in the
            # listing; res[u'meta'] is assumed -- confirm.
            meta = res[u'meta']
            video_title = meta[u'title']
            video_description = meta[u'description']
            video_thumb = meta[u'thumbnail']
            video_date = None # I can't find it

        # NOTE(review): the 'id', 'url', 'title' and 'ext' entries and the
        # return statement are not visible in the listing; they are assumed
        # (a bare dict is returned, 'ext' is 'flv') -- confirm.
        info_dict = {
            'id': video_id,
            'url': video_url,
            'play_path': video_path,
            'title': video_title,
            'description': video_description,
            'thumbnail': video_thumb,
            'upload_date': video_date,
            'ext': 'flv',
        }
        return info_dict
class VimeoIE(InfoExtractor):
    """Information extractor for vimeo.com."""

    # NOTE(review): the listing this was reviewed from omits some source lines
    # (``try:``/``except:``, ``else:``, ``break``, parts of dict literals).
    # They are restored from context; assumed values are flagged individually.

    # _VALID_URL matches Vimeo URLs
    _VALID_URL = r'(?P<proto>https?://)?(?:(?:www|player)\.)?vimeo(?P<pro>pro)?\.com/(?:(?:(?:groups|album)/[^/]+)|(?:.*?)/)?(?P<direct_link>play_redirect_hls\?clip_id=)?(?:videos?/)?(?P<id>[0-9]+)'
    # NOTE(review): not visible in the listing; presumed -- confirm.
    IE_NAME = u'vimeo'

    def _verify_video_password(self, url, video_id, webpage):
        """Submit the user-supplied password for a protected video.

        Raises ExtractorError if no password was given or the page carries
        no xsrft token.
        """
        password = self._downloader.params.get('password', None)
        if password is None:
            raise ExtractorError(u'This video is protected by a password, use the --password option')
        token_match = re.search(r'xsrft: \'(.*?)\'', webpage)
        if token_match is None:
            # Previously an AttributeError on None
            raise ExtractorError(u'Unable to extract xsrft token')
        token = token_match.group(1)
        # NOTE(review): the second form field is not visible in the listing;
        # 'token' is assumed -- confirm.
        data = compat_urllib_parse.urlencode({'password': password,
                                              'token': token})
        # I didn't manage to use the password with https
        if url.startswith('https'):
            # Only the scheme is rewritten, not later occurrences in the URL
            pass_url = url.replace('https', 'http', 1)
        else:
            pass_url = url
        password_request = compat_urllib_request.Request(pass_url+'/password', data)
        password_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
        password_request.add_header('Cookie', 'xsrft=%s' % token)
        # NOTE(review): the error-note argument is not visible in the
        # listing; u'Wrong password' is assumed -- confirm.
        self._download_webpage(password_request, video_id,
                               u'Verifying the password',
                               u'Wrong password')

    def _real_extract(self, url, new_video=True):
        """Return a one-element list with the info dict for a Vimeo URL.

        `new_video` is accepted for interface compatibility and is not read.
        Raises ExtractorError for an invalid URL, an unreadable config
        section, a domain-restricted video or an unknown codec.
        """
        # Extract ID from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        if not mobj.group('proto'):
            url = 'https://' + url
        if mobj.group('direct_link') or mobj.group('pro'):
            url = 'https://vimeo.com/' + video_id

        # Retrieve video webpage to extract further information
        request = compat_urllib_request.Request(url, None, std_headers)
        webpage = self._download_webpage(request, video_id)

        # Now we begin extracting as much information as we can from what we
        # retrieved. First we extract the information common to all extractors,
        # and later we extract those that are Vimeo specific.
        self.report_extraction(video_id)

        # Extract the config JSON
        try:
            config = webpage.split(' = {config:')[1].split(',assets:')[0]
            config = json.loads(config)
        except (IndexError, ValueError):
            # IndexError: marker missing from the page; ValueError: invalid
            # JSON.  A bare except here would also swallow KeyboardInterrupt.
            if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage):
                raise ExtractorError(u'The author has restricted the access to this video, try with the "--referer" option')

            if re.search('If so please provide the correct password.', webpage):
                self._verify_video_password(url, video_id, webpage)
                return self._real_extract(url)
            else:
                raise ExtractorError(u'Unable to extract info section')

        video_info = config["video"]

        # Extract title
        video_title = video_info["title"]

        # Extract uploader and uploader_id
        owner = video_info["owner"]
        video_uploader = owner["name"]
        video_uploader_id = owner["url"].split('/')[-1] if owner["url"] else None

        # Extract video thumbnail
        video_thumbnail = video_info["thumbnail"]

        # Extract video description
        video_description = get_element_by_attribute("itemprop", "description", webpage)
        if video_description:
            video_description = clean_html(video_description)
        else:
            video_description = u''

        # Extract upload date
        video_upload_date = None
        mobj = re.search(r'<meta itemprop="dateCreated" content="(\d{4})-(\d{2})-(\d{2})T', webpage)
        if mobj is not None:
            video_upload_date = mobj.group(1) + mobj.group(2) + mobj.group(3)

        # Vimeo specific: extract request signature and timestamp
        sig = config['request']['signature']
        timestamp = config['request']['timestamp']

        # Vimeo specific: extract video codec and quality information
        # First consider quality, then codecs, then take everything
        # TODO bind to format param
        codecs = [('h264', 'mp4'), ('vp8', 'flv'), ('vp6', 'flv')]
        files = {'hd': [], 'sd': [], 'other': []}
        available = video_info["files"]
        for codec_name, codec_extension in codecs:
            if codec_name in available:
                if 'hd' in available[codec_name]:
                    files['hd'].append((codec_name, codec_extension, 'hd'))
                elif 'sd' in available[codec_name]:
                    files['sd'].append((codec_name, codec_extension, 'sd'))
                else:
                    files['other'].append((codec_name, codec_extension, available[codec_name][0]))

        for quality in ('hd', 'sd', 'other'):
            if len(files[quality]) > 0:
                video_codec, video_extension, video_quality = files[quality][0]
                self.to_screen(u'%s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality))
                break
        else:
            raise ExtractorError(u'No known codec found')

        video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \
                    % (video_id, sig, timestamp, video_quality, video_codec.upper())

        # NOTE(review): the 'id' and 'url' entries are not visible in the
        # listing; assumed -- confirm.
        return [{
            'id': video_id,
            'url': video_url,
            'uploader': video_uploader,
            'uploader_id': video_uploader_id,
            'upload_date': video_upload_date,
            'title': video_title,
            'ext': video_extension,
            'thumbnail': video_thumbnail,
            'description': video_description,
        }]
class ArteTvIE(InfoExtractor):
    """arte.tv information extractor."""

    # NOTE(review): the listing this was reviewed from omits some source lines
    # (``try:``, ``return ...``, the url/flags arguments of the
    # grep_webpage() calls, parts of the result dict).  They are restored
    # from context; assumed values are flagged individually.

    _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*'
    _LIVE_URL = r'index-[0-9]+\.html$'

    # NOTE(review): not visible in the listing; presumed -- confirm.
    IE_NAME = u'arte.tv'

    def fetch_webpage(self, url):
        """Download `url` and return the raw response body.

        Raises ExtractorError on network failure or a malformed URL.
        """
        request = compat_urllib_request.Request(url)
        try:
            self.report_download_webpage(url)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve video webpage: %s' % compat_str(err))
        except ValueError:
            raise ExtractorError(u'Invalid URL: %s' % url)
        return webpage

    def grep_webpage(self, url, regex, regexFlags, matchTuples):
        """Download `url`, apply `regex` and collect groups into a dict.

        `matchTuples` is a sequence of (group, key, error message); the
        group is stored under `key`, and ExtractorError(error message) is
        raised if that group did not participate in the match.
        Raises ExtractorError if the regex does not match at all.
        """
        page = self.fetch_webpage(url)
        mobj = re.search(regex, page, regexFlags)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        info = {}
        for (i, key, err) in matchTuples:
            if mobj.group(i) is None:
                raise ExtractorError(err)
            info[key] = mobj.group(i)

        return info

    def extractLiveStream(self, url):
        """Follow the live-stream player script and build the stream URL.

        NOTE(review): the computed URL is neither returned nor stored, so
        callers get None.  Left as-is because the intended result shape
        cannot be determined from this file.
        """
        video_lang = url.split('/')[-4]
        info = self.grep_webpage(
            url,
            r'src="(.*?/videothek_js.*?\.js)',
            0,
            [
                (1, 'url', u'Invalid URL: %s' % url)
            ]
        )
        http_host = url.split('/')[2]
        next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url')))
        # Raw strings: '\.' in a plain literal is an invalid escape sequence.
        # The runtime pattern text is unchanged.
        # NOTE(review): the third (rtmp) group is not visible in the listing;
        # it is assumed because the tuples below read group 3 -- confirm.
        info = self.grep_webpage(
            next_url,
            r'(s_artestras_scst_geoFRDE_' + video_lang + r".*?)'.*?" +
                r'(http://.*?\.swf).*?' +
                r"(rtmp://.*?)'",
            re.DOTALL,
            [
                (1, 'path', u'could not extract video path: %s' % url),
                (2, 'player', u'could not extract video player: %s' % url),
                (3, 'url', u'could not extract video url: %s' % url)
            ]
        )
        video_url = u'%s/%s' % (info.get('url'), info.get('path'))

    def extractPlus7Stream(self, url):
        """Follow the three-step +7 lookup and return the info dict."""
        video_lang = url.split('/')[-3]
        info = self.grep_webpage(
            url,
            r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)',
            0,
            [
                (1, 'url', u'Invalid URL: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))
        info = self.grep_webpage(
            next_url,
            r'<video lang="%s" ref="(http[^\'"&]*)' % video_lang,
            0,
            [
                (1, 'url', u'Could not find <video> tag: %s' % url)
            ]
        )
        next_url = compat_urllib_parse.unquote(info.get('url'))

        info = self.grep_webpage(
            next_url,
            r'<video id="(.*?)".*?>.*?' +
                '<name>(.*?)</name>.*?' +
                '<dateVideo>(.*?)</dateVideo>.*?' +
                '<url quality="hd">(.*?)</url>',
            re.DOTALL,
            [
                (1, 'id', u'could not extract video id: %s' % url),
                (2, 'title', u'could not extract video title: %s' % url),
                (3, 'date', u'could not extract video date: %s' % url),
                (4, 'url', u'could not extract video url: %s' % url)
            ]
        )

        return {
            'id': info.get('id'),
            'url': compat_urllib_parse.unquote(info.get('url')),
            'uploader': u'arte.tv',
            'upload_date': unified_strdate(info.get('date')),
            # fetch_webpage() returns the undecoded body, hence the decode
            'title': info.get('title').decode('utf-8'),
            # NOTE(review): the three entries below are not visible in the
            # listing; assumed -- confirm.
            'ext': u'mp4',
            'format': u'NA',
            'player_url': None,
        }

    def _real_extract(self, url):
        """Dispatch to the live or +7 extractor based on the URL's last segment."""
        video_id = url.split('/')[-1]
        self.report_extraction(video_id)

        if re.search(self._LIVE_URL, video_id) is not None:
            self.extractLiveStream(url)
            return

        info = self.extractPlus7Stream(url)
        return [info]
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor."""

    # NOTE(review): the listing this was reviewed from omits some source lines
    # (class constants, ``return "HEAD"``, ``headers=``/``unverifiable=``
    # arguments, ``if mobj is None:`` ...).  They are restored from context;
    # assumed values are flagged individually.

    # NOTE(review): both constants are not visible in the listing; presumed.
    _VALID_URL = r'.*'
    IE_NAME = u'generic'

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        if not self._downloader.params.get('test', False):
            self._downloader.report_warning(u'Falling back on generic information extractor.')
        super(GenericIE, self).report_download_webpage(video_id)

    def report_following_redirect(self, new_url):
        """Report information extraction."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case return the new url."""

        def _without_body_headers(req):
            # Header set for the follow-up request, minus the body-describing
            # headers of the request being replaced.
            return dict((k, v) for k, v in req.headers.items()
                        if k.lower() not in ("content-length", "content-type"))

        class HeadRequest(compat_urllib_request.Request):
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    # Request.get_origin_req_host() was removed in Python
                    # 3.4; the attribute exists on Python 2 and 3 alike.
                    return HeadRequest(newurl,
                                       headers=_without_body_headers(req),
                                       origin_req_host=req.origin_req_host,
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                # NOTE(review): draining/closing fp is not visible in the
                # listing; assumed -- confirm.
                fp.read()
                fp.close()

                return self.parent.open(compat_urllib_request.Request(
                    req.get_full_url(),
                    headers=_without_body_headers(req),
                    origin_req_host=req.origin_req_host,
                    unverifiable=True))

        # Build our opener
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        if response is None:
            raise ExtractorError(u'Invalid URL protocol')
        new_url = response.geturl()

        if url == new_url:
            return False

        self.report_following_redirect(new_url)
        return new_url

    def _real_extract(self, url):
        """Scrape a page for an embedded media URL using generic heuristics.

        Returns a one-element list: a url_result if `url` redirects
        elsewhere, otherwise an info dict.
        Raises ExtractorError if nothing usable is found.
        """
        new_url = self._test_redirect(url)
        if new_url:
            return [self.url_result(new_url)]

        video_id = url.split('/')[-1]
        try:
            webpage = self._download_webpage(url, video_id)
        except ValueError:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            raise ExtractorError(u'Invalid URL: %s' % url)

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit
            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit: JWPlayer JS loader
            mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
        if mobj is None:
            # Try to find twitter cards info
            mobj = re.search(r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage)
        if mobj is None:
            # We look for Open Graph info:
            # We have to match any number spaces between elements, some sites try to align them (eg.: statigr.am)
            m_video_type = re.search(r'<meta.*?property="og:video:type".*?content="video/(.*?)"', webpage)
            # We only look in og:video if the MIME type is a video, don't try if it's a Flash player:
            if m_video_type is not None:
                mobj = re.search(r'<meta.*?property="og:video".*?content="(.*?)"', webpage)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_url = compat_urllib_parse.unquote(mobj.group(1))

        # The media file's base name gives the id, its suffix the extension
        video_id, dotted_ext = os.path.splitext(os.path.basename(video_url))
        video_extension = dotted_ext[1:]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        video_title = self._html_search_regex(r'<title>(.*)</title>',
                                              webpage, u'video title')

        # video uploader is domain name
        video_uploader = self._search_regex(r'(?:https?://)?([^/]*)/.*',
                                            url, u'video uploader')

        # NOTE(review): the 'id', 'url' and 'upload_date' entries are not
        # visible in the listing; assumed -- confirm.
        return [{
            'id': video_id,
            'url': video_url,
            'uploader': video_uploader,
            'upload_date': None,
            'title': video_title,
            'ext': video_extension,
        }]
class YoutubeSearchIE(SearchInfoExtractor):
    """Information Extractor for YouTube search queries."""

    # NOTE(review): the listing this was reviewed from omits some source lines
    # (``try:``, the loop's initial state and bookkeeping).  They are
    # restored from context; assumed values are flagged individually.

    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    # NOTE(review): not visible in the listing; presumed -- confirm.
    _MAX_RESULTS = 1000
    IE_NAME = u'youtube:search'
    _SEARCH_KEY = 'ytsearch'

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        message = u'[youtube] query "%s": Downloading page %s' % (query, pagenum)
        self._downloader.to_screen(message)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        collected_ids = []
        page_index = 0
        # Starts at n and shrinks to the API's reported total once known
        limit = n

        while 50 * page_index < limit:
            self.report_download_page(query, page_index + 1)
            start_index = 50 * page_index + 1
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), start_index)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
            api_response = json.loads(data)['data']

            if 'items' not in api_response:
                raise ExtractorError(u'[youtube] No video results')

            collected_ids.extend([video['id'] for video in api_response['items']])

            limit = min(n, api_response['totalItems'])
            page_index += 1

        # Slicing is a no-op when fewer than n ids were collected
        videos = [self.url_result('http://www.youtube.com/watch?v=%s' % video_id, 'Youtube')
                  for video_id in collected_ids[:n]]
        return self.playlist_result(videos, query)
class GoogleSearchIE(SearchInfoExtractor):
    """Information Extractor for Google Video search queries."""

    # NOTE(review): the listing this was reviewed from omits some source lines
    # (the playlist dict, the per-hit entry dict, ``return res``).  They are
    # restored from context; assumed values are flagged individually.

    _MORE_PAGES_INDICATOR = r'id="pnnext" class="pn"'
    # NOTE(review): not visible in the listing; presumed -- confirm.
    _MAX_RESULTS = 1000
    IE_NAME = u'video.google:search'
    _SEARCH_KEY = 'gvsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query

        Returns a playlist dict whose 'entries' holds at most `n` url-type
        results, in the order the result pages list them.
        """
        # NOTE(review): the '_type'/'id' keys of both dicts are not visible
        # in the listing; assumed -- confirm.
        res = {
            '_type': 'playlist',
            'id': query,
            'entries': []
        }

        # `start` is the zero-based offset of the first hit, so the first
        # page is start=0; counting from 1 skipped the top ten results.
        for pagenum in itertools.count(0):
            result_url = u'http://www.google.com/search?tbm=vid&q=%s&start=%s&hl=en' % (compat_urllib_parse.quote_plus(query), pagenum*10)
            webpage = self._download_webpage(result_url, u'gvsearch:' + query,
                                             note='Downloading result page ' + str(pagenum + 1))

            for mobj in re.finditer(r'<h3 class="r"><a href="([^"]+)"', webpage):
                e = {
                    '_type': 'url',
                    'url': mobj.group(1)
                }
                res['entries'].append(e)

            if len(res['entries']) >= n or not re.search(self._MORE_PAGES_INDICATOR, webpage):
                # Honour the requested count; a page may overshoot it
                res['entries'] = res['entries'][:n]
                return res
class YahooSearchIE(SearchInfoExtractor):
    """Information Extractor for Yahoo! Video search queries."""

    # NOTE(review): the listing this was reviewed from omits some source lines
    # (the playlist dict, ``m = ...``, ``break``, ``return res``).  They are
    # restored from context; assumed values are flagged individually.

    # NOTE(review): not visible in the listing; presumed -- confirm.
    _MAX_RESULTS = 1000
    IE_NAME = u'screen.yahoo:search'
    _SEARCH_KEY = 'yvsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query

        Returns a playlist dict whose 'entries' holds at most `n` results.
        """
        # NOTE(review): the '_type'/'id' keys are not visible in the listing;
        # assumed -- confirm.
        res = {
            '_type': 'playlist',
            'id': query,
            'entries': []
        }
        entries = res['entries']

        for pagenum in itertools.count(0):
            result_url = u'http://video.search.yahoo.com/search/?p=%s&fr=screen&o=js&gs=0&b=%d' % (compat_urllib_parse.quote_plus(query), pagenum * 30)
            webpage = self._download_webpage(result_url, query,
                                             note='Downloading results page '+str(pagenum+1))
            info = json.loads(webpage)
            # NOTE(review): the assignment of `m` is not visible in the
            # listing; info[u'm'] is assumed -- confirm.
            m = info[u'm']
            results = info[u'results']

            for r in results:
                if len(entries) >= n:
                    break
                mobj = re.search(r'(?P<url>screen\.yahoo\.com/.*?-\d*?\.html)"', r)
                entries.append(self.url_result('http://' + mobj.group('url'), 'Yahoo'))

            # Count collected entries instead of reusing the loop index after
            # the loop: the index was unbound on an empty first page, stale on
            # later empty pages and over-counted short pages.  An empty page
            # also ends the search so it cannot spin forever.
            if len(entries) >= n or not results or m[u'last'] >= (m[u'total'] - 1):
                break

        return res
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users."""

    # NOTE(review): the listing this was reviewed from omits some source lines
    # (``if mobj is None:``, the loop's initial state, ``break`` ...).  They
    # are restored from context; assumed values are flagged individually.

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    # NOTE(review): not visible in the listing; 12 is taken from the
    # "currently to 12 videos" comment below -- confirm.
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def _real_extract(self, url):
        """Return a one-element list with a playlist of all the user's videos.

        Raises ExtractorError if the URL is invalid or the user page carries
        no numeric user id.
        """
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        page = self._download_webpage(url, username, u'Downloading user page')
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        if mobj is None:
            # Previously an AttributeError on None
            raise ExtractorError(u'Unable to extract user id')
        page_base = page_base % mobj.group(1)

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        video_ids = []
        pagenum = 1

        while True:
            page_url = page_base + "&page=" + str(pagenum)
            page = self._download_webpage(page_url, username,
                                          u'Downloading video ids from page %d' % pagenum)

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                # Compare in unescaped form: the list stores unescaped ids,
                # so testing the raw match missed duplicates with entities.
                page_id = unescapeHTML(mobj.group(1))
                if page_id not in ids_in_page:
                    ids_in_page.append(page_id)

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.

            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        url_entries = [self.url_result(u'http://blip.tv/%s' % video_id, 'BlipTV')
                       for video_id in video_ids]
        return [self.playlist_result(url_entries, playlist_title = username)]
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com"""

    # NOTE(review): the listing this was reviewed from omits some source lines
    # (``try:``, ``else:``, parts of the result dict).  They are restored
    # from context; assumed values are flagged individually.

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for a file page.

        Raises ExtractorError on network failure, when the site reports a
        restriction, or when no download form is found.
        """
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = {'gateway_result': '1'}
        # urlencode() output is plain ASCII; Python 3 only accepts bytes as
        # POST data, and encoding is a no-op for an ASCII str on Python 2.
        post_data = compat_urllib_parse.urlencode(free_download_indication).encode('ascii')
        request = compat_urllib_request.Request(url, post_data)
        try:
            self.report_download_webpage(file_id)
            # Decode once at the I/O boundary so the text patterns below run
            # on text; the old code mixed raw bytes with str patterns and
            # called .decode() on the URL-derived file_id.
            webpage = compat_urllib_request.urlopen(request).read().decode('utf-8', 'replace')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve file webpage: %s' % compat_str(err))

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                # Raw string: '\s' in a plain literal is an invalid escape
                restriction_message = re.sub(r'\s+', ' ', mobj.group(1)).strip()
                raise ExtractorError(u'%s' % restriction_message)
            else:
                raise ExtractorError(u'Unable to extract download URL from: %s' % url)

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        file_title = self._search_regex(r'<b title="(.*?)">', webpage, u'title')

        return [{
            'id': file_id,
            'url': file_url,
            # NOTE(review): 'uploader', 'upload_date' and 'title' are not
            # visible in the listing; assumed -- confirm.
            'uploader': None,
            'upload_date': None,
            'title': file_title,
            'ext': file_extension,
        }]
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook"""

    # NOTE(review): the listing this was reviewed from omits some source lines
    # (``return``, ``try:``, the login form, parts of the result dict).
    # They are restored from context; assumed values are flagged individually.

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    def _real_initialize(self):
        """Log in with the configured credentials, if any.

        Credentials come from the downloader's 'username'/'password' params
        or, with 'usenetrc', from ~/.netrc.  Failures are reported as
        warnings, not raised.
        """
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    # authenticators() yields (login, account, password)
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        if useremail is None:
            return

        # Log in
        # NOTE(review): the form fields are not visible in the listing;
        # assumed -- confirm.
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In'
        }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read()
            # The login form being served again means we are not logged in
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        """Return a one-element list with the info dict for a Facebook video.

        Raises ExtractorError if the URL is invalid, the player data block
        is missing, or neither an HD nor an SD source is present.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The player variables are a JSON array sitting between these two
        # JavaScript fragments.
        BEFORE = '{swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        if not m:
            raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        video_data = params['video_data'][0]
        # .get() for both sources: indexing 'sd_src' raised KeyError when the
        # key was absent, so the ExtractorError below could not be reached.
        video_url = video_data.get('hd_src') or video_data.get('sd_src')
        if not video_url:
            raise ExtractorError(u'Cannot find video URL')
        video_duration = int(video_data['video_duration'])
        thumbnail = video_data['thumbnail_src']

        # NOTE(review): the trailing arguments of this call are not visible
        # in the listing; (webpage, u'title') is assumed -- confirm.
        video_title = self._html_search_regex('<h2 class="uiHeaderTitle">([^<]+)</h2>',
                                              webpage, u'title')

        # NOTE(review): 'id', 'url' and 'ext' are not visible in the listing;
        # assumed -- confirm.
        info = {
            'id': video_id,
            'title': video_title,
            'url': video_url,
            'ext': 'mp4',
            'duration': video_duration,
            'thumbnail': thumbnail,
        }
        return [info]
1022 class BlipTVIE(InfoExtractor):
1023 """Information extractor for blip.tv"""
1025 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv/((.+/)|(play/)|(api\.swf#))(.+)$'
1026 _URL_EXT = r'^.*\.([a-z0-9]+)$'
1027 IE_NAME = u'blip.tv'
def report_direct_download(self, title):
    """Tell the user that the URL for *title* is served as a direct download."""
    notice = u'%s: Direct download detected' % title
    self.to_screen(notice)
1033 def _real_extract(self, url):
1034 mobj = re.match(self._VALID_URL, url)
1036 raise ExtractorError(u'Invalid URL: %s' % url)
1038 # See https://github.com/rg3/youtube-dl/issues/857
1039 api_mobj = re.match(r'http://a\.blip\.tv/api\.swf#(?P<video_id>[\d\w]+)', url)
1040 if api_mobj is not None:
1041 url = 'http://blip.tv/play/g_%s' % api_mobj.group('video_id')
1042 urlp = compat_urllib_parse_urlparse(url)
1043 if urlp.path.startswith('/play/'):
1044 request = compat_urllib_request.Request(url)
1045 response = compat_urllib_request.urlopen(request)
1046 redirecturl = response.geturl()
1047 rurlp = compat_urllib_parse_urlparse(redirecturl)
1048 file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
1049 url = 'http://blip.tv/a/a-' + file_id
1050 return self._real_extract(url)
1057 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
1058 request = compat_urllib_request.Request(json_url)
1059 request.add_header('User-Agent', 'iTunes/10.6.1')
1060 self.report_extraction(mobj.group(1))
1063 urlh = compat_urllib_request.urlopen(request)
1064 if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
1065 basename = url.split('/')[-1]
1066 title,ext = os.path.splitext(basename)
1067 title = title.decode('UTF-8')
1068 ext = ext.replace('.', '')
1069 self.report_direct_download(title)
1074 'upload_date': None,
1079 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1080 raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
1081 if info is None: # Regular URL
1083 json_code_bytes = urlh.read()
1084 json_code = json_code_bytes.decode('utf-8')
1085 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1086 raise ExtractorError(u'Unable to read video info webpage: %s' % compat_str(err))
1089 json_data = json.loads(json_code)
1090 if 'Post' in json_data:
1091 data = json_data['Post']
1095 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
1096 video_url = data['media']['url']
1097 umobj = re.match(self._URL_EXT, video_url)
1099 raise ValueError('Can not determine filename extension')
1100 ext = umobj.group(1)
1103 'id': data['item_id'],
1105 'uploader': data['display_name'],
1106 'upload_date': upload_date,
1107 'title': data['title'],
1109 'format': data['media']['mimeType'],
1110 'thumbnail': data['thumbnailUrl'],
1111 'description': data['description'],
1112 'player_url': data['embedUrl'],
1113 'user_agent': 'iTunes/10.6.1',
1115 except (ValueError,KeyError) as err:
1116 raise ExtractorError(u'Unable to parse video information: %s' % repr(err))
1121 class MyVideoIE(InfoExtractor):
1122 """Information Extractor for myvideo.de."""
1124 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
1125 IE_NAME = u'myvideo'
1127 # Original Code from: https://github.com/dersphere/plugin.video.myvideo_de.git
1128 # Released into the Public Domain by Tristan Fischer on 2013-05-19
1129 # https://github.com/rg3/youtube-dl/pull/842
1130 def __rc4crypt(self,data, key):
1132 box = list(range(256))
1133 for i in list(range(256)):
1134 x = (x + box[i] + compat_ord(key[i % len(key)])) % 256
1135 box[i], box[x] = box[x], box[i]
1141 y = (y + box[x]) % 256
1142 box[x], box[y] = box[y], box[x]
1143 out += chr(compat_ord(char) ^ box[(box[x] + box[y]) % 256])
1147 return hashlib.md5(s).hexdigest().encode()
1149 def _real_extract(self,url):
1150 mobj = re.match(self._VALID_URL, url)
1152 raise ExtractorError(u'invalid URL: %s' % url)
1154 video_id = mobj.group(1)
1157 b'WXpnME1EZGhNRGhpTTJNM01XVmhOREU0WldNNVpHTTJOakpt'
1158 b'TW1FMU5tVTBNR05pWkRaa05XRXhNVFJoWVRVd1ptSXhaVEV3'
1159 b'TnpsbA0KTVRkbU1tSTRNdz09'
1163 webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
1164 webpage = self._download_webpage(webpage_url, video_id)
1166 mobj = re.search('source src=\'(.+?)[.]([^.]+)\'', webpage)
1167 if mobj is not None:
1168 self.report_extraction(video_id)
1169 video_url = mobj.group(1) + '.flv'
1171 video_title = self._html_search_regex('<title>([^<]+)</title>',
1174 video_ext = self._search_regex('[.](.+?)$', video_url, u'extension')
1180 'upload_date': None,
1181 'title': video_title,
1186 mobj = re.search('var flashvars={(.+?)}', webpage)
1188 raise ExtractorError(u'Unable to extract video')
1193 for (a, b) in re.findall('(.+?):\'(.+?)\',?', sec):
1194 if not a == '_encxml':
1197 encxml = compat_urllib_parse.unquote(b)
1198 if not params.get('domain'):
1199 params['domain'] = 'www.myvideo.de'
1200 xmldata_url = '%s?%s' % (encxml, compat_urllib_parse.urlencode(params))
1201 if 'flash_playertype=MTV' in xmldata_url:
1202 self._downloader.report_warning(u'avoiding MTV player')
1204 'http://www.myvideo.de/dynamic/get_player_video_xml.php'
1205 '?flash_playertype=D&ID=%s&_countlimit=4&autorun=yes'
1209 enc_data = self._download_webpage(xmldata_url, video_id).split('=')[1]
1210 enc_data_b = binascii.unhexlify(enc_data)
1212 base64.b64decode(base64.b64decode(GK)) +
1214 str(video_id).encode('utf-8')
1217 dec_data = self.__rc4crypt(enc_data_b, sk)
1220 self.report_extraction(video_id)
1223 mobj = re.search('connectionurl=\'(.*?)\'', dec_data)
1225 video_url = compat_urllib_parse.unquote(mobj.group(1))
1226 if 'myvideo2flash' in video_url:
1227 self._downloader.report_warning(u'forcing RTMPT ...')
1228 video_url = video_url.replace('rtmpe://', 'rtmpt://')
1231 # extract non rtmp videos
1232 mobj = re.search('path=\'(http.*?)\' source=\'(.*?)\'', dec_data)
1234 raise ExtractorError(u'unable to extract url')
1235 video_url = compat_urllib_parse.unquote(mobj.group(1)) + compat_urllib_parse.unquote(mobj.group(2))
1237 video_file = self._search_regex('source=\'(.*?)\'', dec_data, u'video file')
1238 video_file = compat_urllib_parse.unquote(video_file)
1240 if not video_file.endswith('f4m'):
1241 ppath, prefix = video_file.split('.')
1242 video_playpath = '%s:%s' % (prefix, ppath)
1243 video_hls_playlist = ''
1246 video_hls_playlist = (
1247 video_filepath + video_file
1248 ).replace('.f4m', '.m3u8')
1250 video_swfobj = self._search_regex('swfobject.embedSWF\(\'(.+?)\'', webpage, u'swfobj')
1251 video_swfobj = compat_urllib_parse.unquote(video_swfobj)
1253 video_title = self._html_search_regex("<h1(?: class='globalHd')?>(.*?)</h1>",
1259 'tc_url': video_url,
1261 'upload_date': None,
1262 'title': video_title,
1264 'play_path': video_playpath,
1265 'video_file': video_file,
1266 'video_hls_playlist': video_hls_playlist,
1267 'player_url': video_swfobj,
1271 class ComedyCentralIE(InfoExtractor):
1272 """Information extractor for The Daily Show and Colbert Report """
1274 # urls can be abbreviations like :thedailyshow or :colbert
1275 # urls for episodes like:
1276 # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
1277 # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
1278 # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
1279 _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
1280 |(https?://)?(www\.)?
1281 (?P<showname>thedailyshow|colbertnation)\.com/
1282 (full-episodes/(?P<episode>.*)|
1284 (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
1285 |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
1288 _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
1290 _video_extensions = {
1298 _video_dimensions = {
def suitable(cls, url):
    """Return True when *url* matches this extractor's URL pattern.

    The pattern is written in verbose form (whitespace and line breaks
    inside it are layout only), so it has to be matched with re.VERBOSE.

    NOTE(review): the first parameter is ``cls``, so this is presumably
    bound as a classmethod where the class is assembled -- confirm.
    """
    matched = re.match(cls._VALID_URL, url, re.VERBOSE)
    return matched is not None
1312 def _print_formats(self, formats):
1313 print('Available formats:')
1315 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))
1318 def _real_extract(self, url):
1319 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
1321 raise ExtractorError(u'Invalid URL: %s' % url)
1323 if mobj.group('shortname'):
1324 if mobj.group('shortname') in ('tds', 'thedailyshow'):
1325 url = u'http://www.thedailyshow.com/full-episodes/'
1327 url = u'http://www.colbertnation.com/full-episodes/'
1328 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
1329 assert mobj is not None
1331 if mobj.group('clip'):
1332 if mobj.group('showname') == 'thedailyshow':
1333 epTitle = mobj.group('tdstitle')
1335 epTitle = mobj.group('cntitle')
1338 dlNewest = not mobj.group('episode')
1340 epTitle = mobj.group('showname')
1342 epTitle = mobj.group('episode')
1344 self.report_extraction(epTitle)
1345 webpage,htmlHandle = self._download_webpage_handle(url, epTitle)
1347 url = htmlHandle.geturl()
1348 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
1350 raise ExtractorError(u'Invalid redirected URL: ' + url)
1351 if mobj.group('episode') == '':
1352 raise ExtractorError(u'Redirected URL is still not specific: ' + url)
1353 epTitle = mobj.group('episode')
1355 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)
1357 if len(mMovieParams) == 0:
1358 # The Colbert Report embeds the information in a without
1359 # a URL prefix; so extract the alternate reference
1360 # and then add the URL prefix manually.
1362 altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
1363 if len(altMovieParams) == 0:
1364 raise ExtractorError(u'unable to find Flash URL in webpage ' + url)
1366 mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
1368 uri = mMovieParams[0][1]
1369 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
1370 indexXml = self._download_webpage(indexUrl, epTitle,
1371 u'Downloading show index',
1372 u'unable to download episode index')
1376 idoc = xml.etree.ElementTree.fromstring(indexXml)
1377 itemEls = idoc.findall('.//item')
1378 for partNum,itemEl in enumerate(itemEls):
1379 mediaId = itemEl.findall('./guid')[0].text
1380 shortMediaId = mediaId.split(':')[-1]
1381 showId = mediaId.split(':')[-2].replace('.com', '')
1382 officialTitle = itemEl.findall('./title')[0].text
1383 officialDate = unified_strdate(itemEl.findall('./pubDate')[0].text)
1385 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
1386 compat_urllib_parse.urlencode({'uri': mediaId}))
1387 configXml = self._download_webpage(configUrl, epTitle,
1388 u'Downloading configuration for %s' % shortMediaId)
1390 cdoc = xml.etree.ElementTree.fromstring(configXml)
1392 for rendition in cdoc.findall('.//rendition'):
1393 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
1397 self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
1400 if self._downloader.params.get('listformats', None):
1401 self._print_formats([i[0] for i in turls])
1404 # For now, just pick the highest bitrate
1405 format,rtmp_video_url = turls[-1]
1407 # Get the format arg from the arg stream
1408 req_format = self._downloader.params.get('format', None)
1410 # Select format if we can find one
1413 format, rtmp_video_url = f, v
1416 m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
1418 raise ExtractorError(u'Cannot transform RTMP url')
1419 base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
1420 video_url = base + m.group('finalid')
1422 effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
1427 'upload_date': officialDate,
1432 'description': officialTitle,
1434 results.append(info)
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def _real_extract(self, url):
        """Return a one-element list holding the info dict for an episode.

        The episode page only carries metadata; the media URL lives in a
        player configuration file whose address is passed to the player as
        the ``config=`` query value of the ``og:video`` URL.

        Raises ExtractorError when the URL does not match, when a mandatory
        field is missing from the page, or when the configuration cannot be
        parsed or lacks the expected playlist entry.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(videoId)
        webpage = self._download_webpage(url, videoId)

        # Description and thumbnail are optional extras.
        videoDesc = self._html_search_regex('<meta name="description" content="([^"]*)"',
            webpage, u'description', fatal=False)

        imgUrl = self._html_search_regex('<meta property="og:image" content="([^"]*)"',
            webpage, u'thumbnail', fatal=False)

        playerUrl = self._html_search_regex('<meta property="og:video" content="([^"]*)"',
            webpage, u'player url')

        # The meta title looks like "<prefix> : <title>"; keep the last part.
        # (This lookup used to be labelled u'player url', which made its
        # failure message point at the wrong field.)
        title = self._html_search_regex('<meta name="title" content="([^"]*)"',
            webpage, u'title').split(' : ')[-1]

        configUrl = self._search_regex('config=(.*)$', playerUrl, u'config url')
        configUrl = compat_urllib_parse.unquote(configUrl)

        configJSON = self._download_webpage(configUrl, videoId,
                                            u'Downloading configuration',
                                            u'unable to download configuration')

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

        try:
            config = json.loads(configJSON)
        except (ValueError,) as err:
            raise ExtractorError(u'Invalid JSON in configuration file: ' + compat_str(err))

        # The media URL is read from the second playlist entry.
        try:
            videoUrl = config['playlist'][1]['url']
        except (KeyError, IndexError, TypeError):
            raise ExtractorError(u'Unable to find video URL in configuration file')

        info = {
            'id': videoId,
            'url': videoUrl,
            'uploader': showName,
            'upload_date': None,
            'title': title,
            'ext': 'mp4',
            'thumbnail': imgUrl,
            'description': videoDesc,
            'player_url': playerUrl,
        }

        return [info]
1499 class CollegeHumorIE(InfoExtractor):
1500 """Information extractor for collegehumor.com"""
1503 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
1504 IE_NAME = u'collegehumor'
def report_manifest(self, video_id):
    """Tell the user that the XML manifest for *video_id* is being fetched."""
    notice = u'%s: Downloading XML manifest' % video_id
    self.to_screen(notice)
1510 def _real_extract(self, url):
1511 mobj = re.match(self._VALID_URL, url)
1513 raise ExtractorError(u'Invalid URL: %s' % url)
1514 video_id = mobj.group('videoid')
1519 'upload_date': None,
1522 self.report_extraction(video_id)
1523 xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
1525 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
1526 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1527 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
1529 mdoc = xml.etree.ElementTree.fromstring(metaXml)
1531 videoNode = mdoc.findall('./video')[0]
1532 info['description'] = videoNode.findall('./description')[0].text
1533 info['title'] = videoNode.findall('./caption')[0].text
1534 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
1535 manifest_url = videoNode.findall('./file')[0].text
1537 raise ExtractorError(u'Invalid metadata XML file')
1539 manifest_url += '?hdcore=2.10.3'
1540 self.report_manifest(video_id)
1542 manifestXml = compat_urllib_request.urlopen(manifest_url).read()
1543 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1544 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
1546 adoc = xml.etree.ElementTree.fromstring(manifestXml)
1548 media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
1549 node_id = media_node.attrib['url']
1550 video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
1551 except IndexError as err:
1552 raise ExtractorError(u'Invalid manifest file')
1554 url_pr = compat_urllib_parse_urlparse(manifest_url)
1555 url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for a video page.

        Raises ExtractorError when the URL does not match or when the media
        URL or the title cannot be found; a missing thumbnail is tolerated.
        """
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = url_match.group(1)

        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        # The media location is embedded percent-quoted as "flv_url=...&".
        quoted_url = self._search_regex(r'flv_url=(.+?)&',
            webpage, u'video URL')
        video_url = compat_urllib_parse.unquote(quoted_url)

        # The page title ends in " - XVID..."; keep what precedes it.
        video_title = self._html_search_regex(r'<title>(.*?)\s+-\s+XVID',
            webpage, u'title')

        # NOTE(review): the only capturing group is the trailing file name,
        # not the full thumbnail URL -- confirm what _search_regex returns.
        video_thumbnail = self._search_regex(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com

       The track page URL is handed to the api.soundcloud.com resolve
       endpoint, which answers with the track metadata. The track id found
       there is then used to request the stream definitions from
       api.sndcdn.com, and the 'http_mp3_128_url' entry of that answer is
       used as the media URL.
     """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def report_resolve(self, video_id):
        """Tell the user that the id for *video_id* is being resolved."""
        notice = u'%s: Resolving id' % video_id
        self.to_screen(notice)

    def _real_extract(self, url):
        """Return a one-element list with the info dict for a single track.

        Raises ExtractorError when the URL does not match.
        """
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Both the uploader and the song slug are taken from the URL.
        uploader, slug_title = url_match.group(1), url_match.group(2)
        simple_title = uploader + u'-' + slug_title
        full_title = '%s/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        # Rebuild a canonical track URL and let the API resolve it.
        track_url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + track_url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        info = json.loads(
            self._download_webpage(resolv_url, full_title, u'Downloading info JSON'))
        video_id = info['id']
        self.report_extraction(full_title)

        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        streams = json.loads(
            self._download_webpage(streams_url, full_title,
                                   u'Downloading stream definitions',
                                   u'unable to download stream definitions'))
        mediaURL = streams['http_mp3_128_url']
        upload_date = unified_strdate(info['created_at'])

        return [{
            'id':       info['id'],
            'url':      mediaURL,
            'uploader': info['user']['username'],
            'upload_date': upload_date,
            'title':    info['title'],
            'ext':      u'mp3',
            'description': info['description'],
        }]
class SoundcloudSetIE(InfoExtractor):
    """Information extractor for soundcloud.com sets

       The set page URL is handed to the api.soundcloud.com resolve
       endpoint, which answers with the set metadata including its tracks.
       For every track the stream definitions are requested from
       api.sndcdn.com, and the 'http_mp3_128_url' entry of that answer is
       used as the media URL.
     """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
    IE_NAME = u'soundcloud:set'

    def report_resolve(self, video_id):
        """Tell the user that the id for *video_id* is being resolved."""
        notice = u'%s: Resolving id' % video_id
        self.to_screen(notice)

    def _real_extract(self, url):
        """Return one info dict per track of the set.

        Raises ExtractorError when the URL does not match. When the resolve
        answer carries an 'errors' list, every error is reported through the
        downloader and nothing (None) is returned.
        """
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Both the uploader and the set slug are taken from the URL.
        uploader, slug_title = url_match.group(1), url_match.group(2)
        simple_title = uploader + u'-' + slug_title
        full_title = '%s/sets/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        # Rebuild a canonical set URL and let the API resolve it.
        set_url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + set_url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        info = json.loads(self._download_webpage(resolv_url, full_title))

        if 'errors' in info:
            for err in info['errors']:
                self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
            return

        self.report_extraction(full_title)

        videos = []
        for track in info['tracks']:
            video_id = track['id']

            streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
            stream_json = self._download_webpage(streams_url, video_id, u'Downloading track info JSON')

            self.report_extraction(video_id)
            streams = json.loads(stream_json)
            mediaURL = streams['http_mp3_128_url']

            videos.append({
                'id':       video_id,
                'url':      mediaURL,
                'uploader': track['user']['username'],
                'upload_date':  unified_strdate(track['created_at']),
                'title':    track['title'],
                'ext':      u'mp3',
                'description': track['description'],
            })
        return videos
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for an InfoQ page.

        Raises ExtractorError when the URL does not match, when the stream
        reference or the title cannot be found in the page, or when the
        stream file name has no extension. A missing description is
        tolerated.
        """
        if re.match(self._VALID_URL, url) is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # Extract video URL: the stream path is stored base64-encoded (and
        # percent-quoted underneath) in the "jsclassref" script variable.
        mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video url')
        real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        # Extract title
        video_title = self._search_regex(r'contentTitle = "(.*?)";',
            webpage, u'title')

        # Extract description
        video_description = self._html_search_regex(r'<meta name="description" content="(.*)"(?:\s*/)?>',
            webpage, u'description', fatal=False)

        # Split on the LAST dot only: a two-way split('.') blew up with a
        # bare ValueError as soon as the file name contained a second dot
        # (or none at all).
        video_filename = video_url.split('/')[-1]
        video_id, dot, extension = video_filename.rpartition('.')
        if not dot:
            raise ExtractorError(u'Unable to determine extension of %s' % video_filename)

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }

        return [info]
1768 class MixcloudIE(InfoExtractor):
1769 """Information extractor for www.mixcloud.com"""
1771 _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
1772 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
1773 IE_NAME = u'mixcloud'
def report_download_json(self, file_id):
    """Tell the user that the JSON description is being downloaded.

    *file_id* is accepted but does not appear in the message.
    """
    notice = u'Downloading json'
    self.to_screen(notice)
def get_urls(self, jsonData, fmt, bitrate='best'):
    """Return the URL list stored for *fmt* in the 'audio_formats' mapping.

    When ``jsonData[fmt]`` is keyed by bitrate, the entry for *bitrate* is
    returned; if *bitrate* is None, 'best' or not one of the keys, the
    highest key is used instead. When indexing by bitrate raises TypeError
    (the format has no per-bitrate split), ``jsonData[fmt]`` itself is
    returned.
    """
    per_format = jsonData[fmt]
    try:
        wants_best = bitrate is None or bitrate == 'best'
        if wants_best or bitrate not in per_format:
            bitrate = max(per_format)  # select highest
        return per_format[bitrate]
    except TypeError:  # we have no bitrate info.
        return per_format
def check_urls(self, url_list):
    """Return the first URL of *url_list* that can be opened, else None.

    Each candidate is probed by opening it; network-level failures mean
    "try the next one". The probe response is closed right away so that
    checking several URLs does not leave connections open.
    """
    for candidate in url_list:
        try:
            handle = compat_urllib_request.urlopen(candidate)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error):
            continue
        handle.close()
        return candidate

    return None
1803 def _print_formats(self, formats):
1804 print('Available formats:')
1805 for fmt in formats.keys():
1806 for b in formats[fmt]:
1808 ext = formats[fmt][b][0]
1809 print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
1810 except TypeError: # we have no bitrate info
1811 ext = formats[fmt][0]
1812 print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
1815 def _real_extract(self, url):
1816 mobj = re.match(self._VALID_URL, url)
1818 raise ExtractorError(u'Invalid URL: %s' % url)
1819 # extract uploader & filename from url
1820 uploader = mobj.group(1).decode('utf-8')
1821 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
1823 # construct API request
1824 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
1825 # retrieve .json file with links to files
1826 request = compat_urllib_request.Request(file_url)
1828 self.report_download_json(file_url)
1829 jsonData = compat_urllib_request.urlopen(request).read()
1830 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1831 raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))
1834 json_data = json.loads(jsonData)
1835 player_url = json_data['player_swf_url']
1836 formats = dict(json_data['audio_formats'])
1838 req_format = self._downloader.params.get('format', None)
1841 if self._downloader.params.get('listformats', None):
1842 self._print_formats(formats)
1845 if req_format is None or req_format == 'best':
1846 for format_param in formats.keys():
1847 url_list = self.get_urls(formats, format_param)
1849 file_url = self.check_urls(url_list)
1850 if file_url is not None:
1853 if req_format not in formats:
1854 raise ExtractorError(u'Format is not available')
1856 url_list = self.get_urls(formats, req_format)
1857 file_url = self.check_urls(url_list)
1858 format_param = req_format
1861 'id': file_id.decode('utf-8'),
1862 'url': file_url.decode('utf-8'),
1863 'uploader': uploader.decode('utf-8'),
1864 'upload_date': None,
1865 'title': json_data['name'],
1866 'ext': file_url.split('.')[-1].decode('utf-8'),
1867 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
1868 'thumbnail': json_data['thumbnail_url'],
1869 'description': json_data['description'],
1870 'player_url': player_url.decode('utf-8'),
1873 class StanfordOpenClassroomIE(InfoExtractor):
1874 """Information extractor for Stanford's Open ClassRoom"""
1876 _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
1877 IE_NAME = u'stanfordoc'
1879 def _real_extract(self, url):
1880 mobj = re.match(self._VALID_URL, url)
1882 raise ExtractorError(u'Invalid URL: %s' % url)
1884 if mobj.group('course') and mobj.group('video'): # A specific video
1885 course = mobj.group('course')
1886 video = mobj.group('video')
1888 'id': course + '_' + video,
1890 'upload_date': None,
1893 self.report_extraction(info['id'])
1894 baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
1895 xmlUrl = baseUrl + video + '.xml'
1897 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
1898 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1899 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
1900 mdoc = xml.etree.ElementTree.fromstring(metaXml)
1902 info['title'] = mdoc.findall('./title')[0].text
1903 info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
1905 raise ExtractorError(u'Invalid metadata XML file')
1906 info['ext'] = info['url'].rpartition('.')[2]
1908 elif mobj.group('course'): # A course page
1909 course = mobj.group('course')
1914 'upload_date': None,
1917 coursepage = self._download_webpage(url, info['id'],
1918 note='Downloading course info page',
1919 errnote='Unable to download course info page')
1921 info['title'] = self._html_search_regex('<h1>([^<]+)</h1>', coursepage, 'title', default=info['id'])
1923 info['description'] = self._html_search_regex('<description>([^<]+)</description>',
1924 coursepage, u'description', fatal=False)
1926 links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
1929 'type': 'reference',
1930 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
1934 for entry in info['list']:
1935 assert entry['type'] == 'reference'
1936 results += self.extract(entry['url'])
1940 'id': 'Stanford OpenClassroom',
1943 'upload_date': None,
1946 self.report_download_webpage(info['id'])
1947 rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
1949 rootpage = compat_urllib_request.urlopen(rootURL).read()
1950 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
1951 raise ExtractorError(u'Unable to download course info page: ' + compat_str(err))
1953 info['title'] = info['id']
1955 links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
1958 'type': 'reference',
1959 'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
1964 for entry in info['list']:
1965 assert entry['type'] == 'reference'
1966 results += self.extract(entry['url'])
1969 class MTVIE(InfoExtractor):
1970 """Information extractor for MTV.com"""
1972 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
1975 def _real_extract(self, url):
1976 mobj = re.match(self._VALID_URL, url)
1978 raise ExtractorError(u'Invalid URL: %s' % url)
1979 if not mobj.group('proto'):
1980 url = 'http://' + url
1981 video_id = mobj.group('videoid')
1983 webpage = self._download_webpage(url, video_id)
1985 song_name = self._html_search_regex(r'<meta name="mtv_vt" content="([^"]+)"/>',
1986 webpage, u'song name', fatal=False)
1988 video_title = self._html_search_regex(r'<meta name="mtv_an" content="([^"]+)"/>',
1991 mtvn_uri = self._html_search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>',
1992 webpage, u'mtvn_uri', fatal=False)
1994 content_id = self._search_regex(r'MTVN.Player.defaultPlaylistId = ([0-9]+);',
1995 webpage, u'content id', fatal=False)
1997 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
1998 self.report_extraction(video_id)
1999 request = compat_urllib_request.Request(videogen_url)
2001 metadataXml = compat_urllib_request.urlopen(request).read()
2002 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
2003 raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))
2005 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
2006 renditions = mdoc.findall('.//rendition')
2008 # For now, always pick the highest quality.
2009 rendition = renditions[-1]
2012 _,_,ext = rendition.attrib['type'].partition('/')
2013 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
2014 video_url = rendition.find('./src').text
2016 raise ExtractorError('Invalid rendition field.')
2021 'uploader': performer,
2022 'upload_date': None,
2023 'title': video_title,
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com."""

    _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def _gen_sid(self):
        """Return a session id: the millisecond timestamp followed by two random numbers."""
        now_ms = int(time.time() * 1000)
        random1 = random.randint(1000, 1998)
        random2 = random.randint(1000, 9999)
        return "%d%d%d" % (now_ms, random1, random2)

    def _get_file_ID_mix_string(self, seed):
        """Return the file-id alphabet as a list of characters, shuffled by ``seed``.

        A linear congruential generator driven by ``seed`` repeatedly picks
        (and removes) one character of the remaining alphabet.
        """
        mixed = []
        # The backslash is written as an explicit escape; the resulting string
        # is the same as the historical "/\:" spelling.
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\\:._-1234567890")
        seed = float(seed)
        while source:
            seed = (seed * 211 + 30031) % 65536
            index = int(math.floor(seed / 65536 * len(source)))
            mixed.append(source.pop(index))
        return mixed

    def _get_file_id(self, fileId, seed):
        """Decode a '*'-separated list of indices into the real file id string."""
        mixed = self._get_file_ID_mix_string(seed)
        return ''.join(mixed[int(ch)] for ch in fileId.split('*') if ch)

    def _real_extract(self, url):
        """Return a list with one info dict per segment of the video.

        Raises ExtractorError when the URL does not match or when the
        playlist JSON cannot be parsed / lacks the expected keys.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        jsondata = self._download_webpage(info_url, video_id)

        self.report_extraction(video_id)
        try:
            config = json.loads(jsondata)
            video_data = config['data'][0]

            video_title = video_data['title']
            seed = video_data['seed']

            video_format = self._downloader.params.get('format', None)
            supported_format = list(video_data['streamfileids'].keys())

            if video_format is None or video_format == 'best':
                video_format = 'hd2' if 'hd2' in supported_format else 'flv'
                ext = u'flv'
            elif video_format == 'worst':
                video_format = 'mp4'
                ext = u'mp4'
            else:
                video_format = 'flv'
                ext = u'flv'

            fileid = video_data['streamfileids'][video_format]
            keys = [s['k'] for s in video_data['segs'][video_format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            raise ExtractorError(u'Unable to extract info section')

        files_info = []
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        # Characters 8 and 9 (0-based) of fileid hold the segment number and
        # are replaced by the two-digit hexadecimal segment index.
        for index, key in enumerate(keys):
            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            files_info.append({
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            })

        return files_info
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'

    def _real_extract(self, url):
        """Return a one-element list holding the info dict for the video.

        The video URL and title are mandatory (their search raises on
        failure); the thumbnail is optional and may be None.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # The flv URL is embedded percent-encoded in the page
        video_url = self._search_regex(self.VIDEO_URL_RE,
            webpage, u'video URL')
        video_url = compat_urllib_parse.unquote(video_url)

        video_title = self._html_search_regex(self.VIDEO_TITLE_RE,
            webpage, u'title')

        video_thumbnail = self._search_regex(self.VIDEO_THUMB_RE,
            webpage, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def _real_extract(self, url):
        """Return a one-element list holding the info dict for the post's video.

        Upload date and uploader are optional; the video page URL and at
        least one video link are mandatory (ExtractorError otherwise).
        """
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        post_url = mobj.group(0)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage')

        self.report_extraction(video_id)

        # Extract upload date
        upload_date = self._html_search_regex('title="Timestamp">(.*?)</a>',
            webpage, u'upload date', fatal=False)
        if upload_date:
            # Convert timestring to a format suitable for filename.  The field
            # is optional, so an unexpected timestamp layout yields None
            # instead of aborting the extraction.
            try:
                upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d").strftime('%Y%m%d')
            except ValueError:
                upload_date = None

        # Extract uploader
        uploader = self._html_search_regex(r'rel\="author".*?>(.*?)</a>',
            webpage, u'uploader', fatal=False)

        # Extract title
        # Get the first line for title
        video_title = self._html_search_regex(r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]',
            webpage, 'title', default=u'NA')

        # Step 2, Simulate clicking the image box to launch video
        video_page = self._search_regex(r'"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]',
            webpage, u'video page URL')
        webpage = self._download_webpage(video_page, video_id, u'Downloading video page')

        # Extract video links of all sizes from the video page; every match
        # is a (resolution, url) tuple of strings.
        pattern = r'\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        links = re.findall(pattern, webpage)
        if not links:
            raise ExtractorError(u'Unable to extract video links')

        # Pick the highest resolution.  The resolution must be compared as a
        # number: as a string '720' would sort after '1080'.
        best_link = max(links, key=lambda link: (int(link[0]), link[1]))
        # Only the url is needed; the resolution part has no use anymore
        video_url = best_link[-1]
        # Treat escaped \u0026 style hex
        try:
            video_url = video_url.decode("unicode_escape")
        except AttributeError: # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': uploader,
            'upload_date': upload_date,
            'title': video_title,
            'ext': video_extension,
        }]
class NBAIE(InfoExtractor):
    """Information extractor for nba.com videos."""

    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*?)(?:/index\.html)?(?:\?.*)?$'
    IE_NAME = u'nba'

    def _real_extract(self, url):
        """Return a one-element list holding the info dict for the video.

        The media URL is built from the path captured in the page URL; the
        webpage is only used for the title and the optional description.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group(1)

        webpage = self._download_webpage(url, video_id)

        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'

        # The last path component doubles as id and as fallback title
        shortened_video_id = video_id.rpartition('/')[2]
        title = self._html_search_regex(r'<meta property="og:title" content="(.*?)"',
            webpage, 'title', default=shortened_video_id).replace('NBA.com: ', '')

        # It isn't there in the HTML it returns to us
        # uploader_date = self._html_search_regex(r'<b>Date:</b> (.*?)</div>', webpage, 'upload_date', fatal=False)

        description = self._html_search_regex(r'<meta name="description" (?:content|value)="(.*?)" />', webpage, 'description', fatal=False)

        info = {
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            # 'uploader_date': uploader_date,
            'description': description,
        }
        return [info]
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        (?:
            (?P<channelid>[^/]+)|
            (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
            (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
        )
        /?(?:\#.*)?$
        """
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self.to_screen(u'%s: Downloading video information from %d to %d' %
                (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    def _parse_page(self, url, video_id):
        """Download one API page and return (item count, list of valid info dicts).

        Items without a 'video_file_url' are counted but not returned.
        Raises ExtractorError when the API answers with something other
        than a JSON list.
        """
        webpage = self._download_webpage(url, video_id,
                                         u'Downloading video info JSON',
                                         u'unable to download video info JSON')

        response = json.loads(webpage)
        if not isinstance(response, list):
            error_text = response.get('error', 'unknown error')
            raise ExtractorError(u'Justin.tv API: %s' % error_text)
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            if not video_url:
                continue
            video_uploader_id = clip.get('user_id', clip.get('channel_id'))
            clip_id = clip['id']
            info.append({
                'id': clip_id,
                'url': video_url,
                'title': clip.get('title', clip_id),
                'uploader': clip.get('channel_name', video_uploader_id),
                'uploader_id': video_uploader_id,
                # 'start_time' begins with YYYY-MM-DD; strip the dashes
                'upload_date': re.sub('-', '', clip['start_time'][:10]),
                'ext': os.path.splitext(video_url)[1][1:],
            })
        return (len(response), info)

    def _real_extract(self, url):
        """Return the list of info dicts for a channel, a broadcast or a chapter.

        Channel archives are fetched page by page; broadcasts need a single
        request; chapters are resolved through the chapter XML and the
        kraken JSON API and always yield exactly one entry.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        api_base = 'http://api.justin.tv'
        paged = False
        if mobj.group('channelid'):
            paged = True
            video_id = mobj.group('channelid')
            api = api_base + '/channel/archives/%s.json' % video_id
        elif mobj.group('chapterid'):
            chapter_id = mobj.group('chapterid')

            webpage = self._download_webpage(url, chapter_id)
            m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
            if not m:
                raise ExtractorError(u'Cannot find archive of a chapter')
            archive_id = m.group(1)

            api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
            chapter_info_xml = self._download_webpage(api, chapter_id,
                                             note=u'Downloading chapter information',
                                             errnote=u'Chapter information download failed')
            doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
            # Find the <archive> element belonging to this chapter; the else
            # branch only runs when the loop finished without a match.
            for a in doc.findall('.//archive'):
                if archive_id == a.find('./id').text:
                    break
            else:
                raise ExtractorError(u'Could not find chapter in chapter information')

            video_url = a.find('./video_file_url').text
            video_ext = video_url.rpartition('.')[2] or u'flv'

            chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
            chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
                                   note='Downloading chapter metadata',
                                   errnote='Download of chapter metadata failed')
            chapter_info = json.loads(chapter_info_json)

            bracket_start = int(doc.find('.//bracket_start').text)
            bracket_end = int(doc.find('.//bracket_end').text)

            # TODO determine start (and probably fix up file)
            #  youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
            #video_url += u'?start=' + TODO:start_timestamp
            # bracket_start is 13290, but we want 51670615
            self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
                                            u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))

            info = {
                'id': u'c' + chapter_id,
                'url': video_url,
                'ext': video_ext,
                'title': chapter_info['title'],
                'thumbnail': chapter_info['preview'],
                'description': chapter_info['description'],
                'uploader': chapter_info['channel']['display_name'],
                'uploader_id': chapter_info['channel']['name'],
            }
            return [info]
        else:
            video_id = mobj.group('videoid')
            api = api_base + '/broadcast/by_archive/%s.json' % video_id

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url, video_id)
            info.extend(page_info)
            # A short page means the end of the archive was reached
            if not paged or page_count != limit:
                break
            offset += limit
        return info
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com videos."""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        """Return a one-element list holding the info dict for the video.

        Video URL and title are mandatory; the description is optional.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        # The second <source> inside the <video> tag is the one that is used
        video_url = self._html_search_regex(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"',
            webpage, u'video URL', flags=re.DOTALL)

        # Try the page headline first, then fall back to <title>
        title = self._html_search_regex((r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>",
            r'<title>(?P<title>[^<]+?)</title>'), webpage, 'title', flags=re.DOTALL)

        video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
            webpage, u'description', fatal=False, flags=re.DOTALL)

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': video_description,
        }
        return [info]
class SteamIE(InfoExtractor):
    """Information extractor for the videos of a store.steampowered.com game page."""

    # Verbose regex: it has to be compiled with re.VERBOSE (see suitable())
    _VALID_URL = r"""http://store\.steampowered\.com/
                (agecheck/)?
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """
    _VIDEO_PAGE_TEMPLATE = 'http://store.steampowered.com/video/%s/'
    _AGECHECK_TEMPLATE = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        """Return a one-element list holding a playlist result with all videos of the game."""
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        if m is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        gameID = m.group('gameID')

        videourl = self._VIDEO_PAGE_TEMPLATE % gameID
        webpage = self._download_webpage(videourl, gameID)

        # Age-gated games answer with a birth date form; request the page
        # again through the age check URL, which carries a fixed birth date.
        if re.search('<h2>Please enter your birth date to continue:</h2>', webpage) is not None:
            videourl = self._AGECHECK_TEMPLATE % gameID
            self.report_age_confirmation()
            webpage = self._download_webpage(videourl, gameID)

        self.report_extraction(gameID)
        game_title = self._html_search_regex(r'<h2 class="pageheader">(.*?)</h2>',
                                             webpage, 'game title')

        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        mweb = re.finditer(urlRE, webpage)
        namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
        titles = re.finditer(namesRE, webpage)
        thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
        thumbs = re.finditer(thumbsRE, webpage)

        # Movies, titles and thumbnails are paired up by their order of
        # appearance in the page.
        videos = []
        for vid, vtitle, thumb in zip(mweb, titles, thumbs):
            video_id = vid.group('videoID')
            video_url = vid.group('videoURL')
            if not video_url:
                raise ExtractorError(u'Cannot find video url for %s' % video_id)
            videos.append({
                'id': video_id,
                'url': video_url,
                'ext': 'flv',
                'title': unescapeHTML(vtitle.group('videoName')),
                'thumbnail': thumb.group('thumbnail')
            })
        return [self.playlist_result(videos, gameID, game_title)]
class UstreamIE(InfoExtractor):
    """Information extractor for recorded ustream.tv videos."""

    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        """Return the info dict for the recorded video.

        The media URL is derived from the id alone; the webpage supplies the
        mandatory title and the optional uploader and thumbnail.
        """
        m = re.match(self._VALID_URL, url)
        if m is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = m.group('videoID')

        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        video_title = self._html_search_regex(r'data-title="(?P<title>.+)"',
            webpage, u'title')

        uploader = self._html_search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
            webpage, u'uploader', fatal=False, flags=re.DOTALL)

        thumbnail = self._html_search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"',
            webpage, u'thumbnail', fatal=False)

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'flv',
            'title': video_title,
            'uploader': uploader,
            'thumbnail': thumbnail,
        }
        return info
class WorldStarHipHopIE(InfoExtractor):
    """Information extractor for worldstarhiphop.com and worldstarcandy.com."""

    _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
    IE_NAME = u'WorldStarHipHop'

    def _real_extract(self, url):
        """Return a one-element list holding the info dict for the video."""
        m = re.match(self._VALID_URL, url)
        if m is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = m.group('id')

        webpage_src = self._download_webpage(url, video_id)

        video_url = self._search_regex(r'so\.addVariable\("file","(.*?)"\)',
            webpage_src, u'video URL')

        # The container is guessed from the URL text
        ext = 'mp4' if 'mp4' in video_url else 'flv'

        video_title = self._html_search_regex(r"<title>(.*)</title>",
            webpage_src, u'title')

        # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
        thumbnail = self._html_search_regex(r'rel="image_src" href="(.*)" />',
            webpage_src, u'thumbnail', fatal=False)

        if not thumbnail:
            _title = r"""candytitles.*>(.*)</span>"""
            mobj = re.search(_title, webpage_src)
            if mobj is not None:
                video_title = mobj.group(1)

        results = [{
            'id': video_id,
            'url': video_url,
            'title': video_title,
            'thumbnail': thumbnail,
            'ext': ext,
        }]
        return results
class RBMARadioIE(InfoExtractor):
    """Information extractor for rbmaradio.com shows."""

    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        """Return a one-element list holding the info dict for the show.

        All metadata comes from the ``gon.show`` JSON object embedded in the
        page.  Raises ExtractorError when that JSON cannot be parsed.
        """
        m = re.match(self._VALID_URL, url)
        if m is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = m.group('videoID')

        webpage = self._download_webpage(url, video_id)

        json_data = self._search_regex(r'window\.gon.*?gon\.show=(.+?);$',
            webpage, u'json data', flags=re.MULTILINE)

        try:
            data = json.loads(json_data)
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        video_url = data['akamai_url'] + '&cbr=256'
        url_parts = compat_urllib_parse_urlparse(video_url)
        # The extension is whatever follows the last dot of the URL path
        video_ext = url_parts.path.rpartition('.')[2]
        host = data.get('host', {})
        info = {
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': data['title'],
            'description': data.get('teaser_text'),
            'location': data.get('country_of_origin'),
            'uploader': host.get('name'),
            'uploader_id': host.get('slug'),
            'thumbnail': data.get('image', {}).get('large_url_2x'),
            'duration': data.get('duration'),
        }
        return [info]
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for fmt in formats:
            print(u'%s\t\t%s' % (fmt['ext'], fmt['format']))

    def _specific(self, req_format, formats):
        """Return the first entry whose 'format' equals req_format, or None."""
        for candidate in formats:
            if candidate["format"] == req_format:
                return candidate
        return None

    def _real_extract(self, url):
        """Return the info dict(s) for the requested format(s).

        Returns None after printing the table when 'listformats' is set.
        Raises ExtractorError on an invalid URL, unparsable or incomplete
        page JSON, a missing download list, or an unavailable format.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get JSON parameters
        json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters')
        try:
            params = json.loads(json_params)
        except ValueError:
            raise ExtractorError(u'Invalid JSON')

        self.report_extraction(video_id)
        try:
            video_title = params['title']
            upload_date = unified_strdate(params['release_date_f'])
            video_description = params['description']
            video_uploader = params['submitted_by']
            thumbnail = params['thumbnails'][0]['image']
        except (KeyError, IndexError) as err:
            # The exception has to be converted explicitly: concatenating it
            # to a string raises TypeError and hides the real problem.
            raise ExtractorError('Missing JSON parameter: ' + str(err))

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
            webpage, u'download list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if not links:
            raise ExtractorError(u'ERROR: no known formats available for video')

        self.to_screen(u'Links found: %d' % len(links))

        formats = []
        for link in links:
            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML(link)
            path = compat_urllib_parse_urlparse(video_url).path
            extension = os.path.splitext(path)[1][1:]
            # Fifth path component is "<size>_<bitrate>_<id>"; the format
            # name is "<size>-<bitrate>".
            format_name = "-".join(path.split('/')[4].split('_')[:2])

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': video_title,
                'ext': extension,
                'format': format_name,
                'thumbnail': thumbnail,
                'description': video_description
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self.to_screen(u'Format: %s' % req_format)

        # Entries keep page order: first is treated as best, last as worst
        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            selected = self._specific(req_format, formats)
            if selected is None:
                raise ExtractorError(u'Requested format not available')
            return [selected]
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        """Return a one-element list holding the info dict for the video.

        Id and title are taken from the URL itself; the upload date is
        optional and stays None when it is not found in the page.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('videoid')
        video_title = mobj.group('title')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video URL
        VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
        video_url = self._search_regex(VIDEO_URL_RE, webpage, u'video url')
        video_url = compat_urllib_parse.unquote(video_url)

        # Get the uploaded date
        VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
        upload_date = self._html_search_regex(VIDEO_UPLOADED_RE, webpage, u'upload date', fatal=False)
        if upload_date:
            upload_date = unified_strdate(upload_date)

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': upload_date,
            'title': video_title,
            'ext': 'flv',
            'format': 'flv',
        }

        return [info]
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        """Return a one-element list holding the info dict for the video.

        The title comes from the watch page; the media URL comes from the
        embed page linked there, whose numeric id replaces the id taken from
        the original URL.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('videoid')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video title
        video_title = self._html_search_regex(r'<title>(?P<title>.*)</title>',
            webpage, u'title').strip()

        # Get the embed page
        result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        if result is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')

        embed_page_url = result.group(0).strip()
        video_id = result.group('videoid')

        webpage = self._download_webpage(embed_page_url, video_id)

        # Get the video URL
        video_url = self._search_regex(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
            webpage, u'video URL')

        info = {
            'id': video_id,
            'url': video_url,
            'title': video_title,
            'ext': 'flv',
            'format': 'flv',
            'player_url': embed_page_url,
        }

        return [info]
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes."""

    IE_NAME = '8tracks'
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        """Return a list with one info dict per track of the mix.

        Tracks are requested one after another from the play/next API until
        the API reports that the last track was reached.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        json_like = self._search_regex(r"PAGE.mix = (.*?);\n", webpage, u'trax information', flags=re.DOTALL)
        data = json.loads(json_like)

        # A random number serves as the play session identifier
        session = str(random.randint(0, 1000000000))
        mix_id = data['id']
        track_count = data['tracks_count']
        first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
        next_url = first_url
        res = []
        for i in itertools.count():
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(i+1), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            res.append({
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
                'ext': 'm4a',
            })
            if api_data['set']['at_last_track']:
                break
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
        return res
class KeekIE(InfoExtractor):
    """Information extractor for keek.com."""

    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        """Return a one-element list holding the info dict for the keek.

        Media and thumbnail URLs are derived from the id alone; the webpage
        supplies the mandatory title and the optional uploader.
        """
        m = re.match(self._VALID_URL, url)
        if m is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = m.group('videoID')

        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
        webpage = self._download_webpage(url, video_id)

        video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
            webpage, u'title')

        uploader = self._html_search_regex(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
            webpage, u'uploader', fatal=False)

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'thumbnail': thumbnail,
            'uploader': uploader
        }
        return [info]
class TEDIE(InfoExtractor):
    """Information extractor for ted.com talks and playlists."""

    # Verbose regex: it has to be compiled with re.VERBOSE (see suitable())
    _VALID_URL=r'''http://www\.ted\.com/
                   (
                        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                        |
                        ((?P<type_talk>talks)) # We have a simple talk
                   )
                   (/lang/(.*?))? # The url may contain the language
                   /(?P<name>\w+) # Here goes the name and then ".html"
                   '''

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        """Return a one-element list with either the talk info or the playlist result."""
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        if m is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        if m.group('type_talk'):
            return [self._talk_info(url)]

        playlist_id = m.group('playlist_id')
        name = m.group('name')
        self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id, name))
        return [self._playlist_videos_info(url, name, playlist_id)]

    def _playlist_videos_info(self,url,name,playlist_id=0):
        '''Returns the videos of the playlist'''
        video_RE=r'''
                     <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
                     ([.\s]*?)data-playlist_item_id="(\d+)"
                     ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
                     '''
        video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage = self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        m_videos = re.finditer(video_RE, webpage, re.VERBOSE)
        m_names = re.finditer(video_name_RE, webpage)

        playlist_title = self._html_search_regex(r'div class="headline">\s*?<h1>\s*?<span>(.*?)</span>',
                                                 webpage, 'playlist title')

        # The <li> matches only bound the number of entries; every talk is
        # handed back as a URL result pointing at its own page.
        playlist_entries = []
        for _, m_name in zip(m_videos, m_names):
            talk_url = 'http://www.ted.com%s' % m_name.group('talk_url')
            playlist_entries.append(self.url_result(talk_url, 'TED'))
        return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        if m is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_name = m.group('name')
        webpage = self._download_webpage(url, video_id, 'Downloading \"%s\" page' % video_name)
        self.report_extraction(video_name)
        # If the url includes the language we get the title translated
        title = self._html_search_regex(r'<span id="altHeadline" >(?P<title>.*)</span>',
                                        webpage, 'title')
        json_data = self._search_regex(r'<script.*?>var talkDetails = ({.*?})</script>',
                                       webpage, 'json data')
        talk_details = json.loads(json_data)
        desc = self._html_search_regex(r'<div class="talk-intro">.*?<p.*?>(.*?)</p>',
                                       webpage, 'description', flags = re.DOTALL)

        thumbnail = self._search_regex(r'</span>[\s.]*</div>[\s.]*<img src="(.*?)"',
                                       webpage, 'thumbnail')
        info = {
            'id': talk_details['id'],
            # The last entry of 'htmlStreams' is the one that is used
            'url': talk_details['htmlStreams'][-1]['file'],
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumbnail,
            'description': desc,
        }
        return info
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de."""

    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        """Return a one-element list holding the info dict for the video.

        All values are read from the metadata XML of the video.  The
        download URL and the title are mandatory (ExtractorError when
        missing); format, description and thumbnail are optional.
        """
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            raise ExtractorError(u'Unable to extract download url')
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]

        title_el = metadata.find('title')
        if title_el is None:
            raise ExtractorError(u'Unable to extract title')
        title = title_el.text

        # Without an explicit format id the file extension names the format
        format_id_el = metadata.find('format_id')
        video_format = extension if format_id_el is None else format_id_el.text

        description_el = metadata.find('description')
        description = description_el.text if description_el is not None else None

        imagePreview_el = metadata.find('imagePreview')
        thumbnail = imagePreview_el.text if imagePreview_el is not None else None

        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': video_format,
            'thumbnail': thumbnail,
            'description': description
        }
        return [info]
class SpiegelIE(InfoExtractor):
    """Information extractor for spiegel.de videos."""

    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        """Return a one-element list holding the info dict for the video.

        The title comes from the webpage; file name and duration come from
        the last child element of the per-video XML document.
        """
        m = re.match(self._VALID_URL, url)
        if m is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = m.group('videoID')

        webpage = self._download_webpage(url, video_id)

        video_title = self._html_search_regex(r'<div class="module-title">(.*?)</div>',
            webpage, u'title')

        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(xml_url, video_id,
                    note=u'Downloading XML', errnote=u'Failed to download XML')

        idoc = xml.etree.ElementTree.fromstring(xml_code)
        # The last child of the document root is the variant that is used
        last_type = idoc[-1]
        filename = last_type.findall('./filename')[0].text
        duration = float(last_type.findall('./duration')[0].text)

        video_url = 'http://video2.spiegel.de/flash/' + filename
        video_ext = filename.rpartition('.')[2]
        info = {
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': video_title,
            'duration': duration,
        }
        return [info]
class LiveLeakIE(InfoExtractor):
    """Information extractor for liveleak.com."""

    # NOTE(review): 'http?://' makes only the final 'p' optional, so an
    # https:// prefix is not matched -- confirm whether 'https?://' was meant.
    _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        """Return a one-element list holding the info dict for the video.

        Video URL and title are mandatory; description and uploader are
        optional.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        video_url = self._search_regex(r'file: "(.*?)",',
            webpage, u'video URL')

        # Drop the site prefix from the og:title value
        video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
            webpage, u'title').replace('LiveLeak.com -', '').strip()

        video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
            webpage, u'description', fatal=False)

        video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
            webpage, u'uploader', fatal=False)

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'description': video_description,
            'uploader': video_uploader
        }

        return [info]
class ARDIE(InfoExtractor):
    """Information extractor for the ARD / Das Erste Mediathek."""

    _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
    _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
    _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'

    def _real_extract(self, url):
        """Return a one-element list holding the info dict for the video.

        Picks the stream of media type 0 with the highest quality.  For RTMP
        streams the info dict carries 'url' and 'play_path'; otherwise 'url'
        is the direct .mp4 link.  Every validation failure is reported as
        ExtractorError (never through ``assert``, which is stripped when
        Python runs with -O).
        """
        # determine video id from url: a numeric documentId wins over the
        # last path component
        numid = re.search(r'documentId=([0-9]+)', url)
        if numid:
            video_id = numid.group(1)
        else:
            m = re.match(self._VALID_URL, url)
            if m is None:
                raise ExtractorError(u'Invalid URL: %s' % url)
            video_id = m.group('video_id')

        # determine title and media streams from webpage
        html = self._download_webpage(url, video_id)
        title_match = re.search(self._TITLE, html)
        if title_match is None:
            raise ExtractorError(u'Cannot extract title')
        title = title_match.group('title')

        streams = [s.groupdict() for s in re.finditer(self._MEDIA_STREAM, html)]
        if not streams:
            if '"fsk"' in html:
                raise ExtractorError(u'This video is only available after 8:00 pm')
            raise ExtractorError(u'No media url found.')

        # choose default media type and highest quality for now
        default_streams = [s for s in streams if int(s["media_type"]) == 0]
        if not default_streams:
            raise ExtractorError(u'No stream found.')
        stream = max(default_streams, key=lambda s: int(s["quality"]))

        # there's two possibilities: RTMP stream or HTTP download
        info = {'id': video_id, 'title': title, 'ext': 'mp4'}
        if stream['rtmp_url']:
            self.to_screen(u'RTMP download detected')
            if not stream['video_url'].startswith('mp4:'):
                raise ExtractorError(u'Unexpected RTMP play path: %s' % stream['video_url'])
            info["url"] = stream["rtmp_url"]
            info["play_path"] = stream['video_url']
        else:
            if not stream["video_url"].endswith('.mp4'):
                raise ExtractorError(u'Unexpected video URL: %s' % stream['video_url'])
            info["url"] = stream["video_url"]
        return [info]
class ZDFIE(InfoExtractor):
    """Information extractor for the ZDF Mediathek."""

    _VALID_URL = r'^http://www\.zdf\.de\/ZDFmediathek\/(.*beitrag\/video\/)(?P<video_id>[^/\?]+)(?:\?.*)?'
    _TITLE = r'<h1(?: class="beitragHeadline")?>(?P<title>.*)</h1>'
    _MEDIA_STREAM = r'<a href="(?P<video_url>.+(?P<media_type>.streaming).+/zdf/(?P<quality>[^\/]+)/[^"]*)".+class="play".+>'
    _MMS_STREAM = r'href="(?P<video_url>mms://[^"]*)"'
    _RTSP_STREAM = r'(?P<video_url>rtsp://[^"]*.mp4)'

    def _real_extract(self, url):
        """Return a one-element list holding the info dict for the video.

        Only 'wstreaming' links are considered; quality 'veryhigh' is
        preferred over '300'.  Raises ExtractorError when the URL is invalid
        or when no stream, title, media URL or extension can be found.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('video_id')

        html = self._download_webpage(url, video_id)
        streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
        if not streams:
            raise ExtractorError(u'No media url found.')

        # s['media_type'] == 'wstreaming' -> use 'Windows Media Player' and mms url
        # s['media_type'] == 'hstreaming' -> use 'Quicktime' and rtsp url
        # ('hstreaming' - rtsp is not working)
        # Qualities are tried in ascending order of preference, so a later
        # hit ('veryhigh' - dsl2000mbit) replaces an earlier one
        # ('300' - dsl1000mbit).
        stream_ = None
        for quality in ('300', 'veryhigh'):
            for s in streams:
                if s['quality'] == quality and s['media_type'] == 'wstreaming':
                    stream_ = s
                    break
        if stream_ is None:
            raise ExtractorError(u'No stream found.')

        media_link = self._download_webpage(stream_['video_url'], video_id,'Get stream URL')

        self.report_extraction(video_id)
        mobj = re.search(self._TITLE, html)
        if mobj is None:
            raise ExtractorError(u'Cannot extract title')
        title = unescapeHTML(mobj.group('title'))

        # Prefer the mms:// link and fall back to rtsp://
        mobj = re.search(self._MMS_STREAM, media_link)
        if mobj is None:
            mobj = re.search(self._RTSP_STREAM, media_link)
            if mobj is None:
                raise ExtractorError(u'Cannot extract mms:// or rtsp:// URL')
        mms_url = mobj.group('video_url')

        mobj = re.search('(.*)[.](?P<ext>[^.]+)', mms_url)
        if mobj is None:
            raise ExtractorError(u'Cannot extract extention')
        ext = mobj.group('ext')

        return [{
            'id': video_id,
            'url': mms_url,
            'title': title,
            'ext': ext
        }]
class TumblrIE(InfoExtractor):
    """Information Extractor for videos hosted on tumblr.com blogs."""

    _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'

    def _real_extract(self, url):
        """Return a one-element list with the info dict of a Tumblr video post.

        Raises ExtractorError when the URL is invalid or no video is found.
        """
        m_url = re.match(self._VALID_URL, url)
        if m_url is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = m_url.group('id')
        blog = m_url.group('blog_name')

        url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
        webpage = self._download_webpage(url, video_id)

        # The blog name and id come from the URL, so escape them before
        # embedding them in a pattern (a blog name may contain dots).
        re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (
            re.escape(blog), re.escape(video_id))
        video = re.search(re_video, webpage)
        if video is None:
            raise ExtractorError(u'Unable to extract video')
        video_url = video.group('video_url')
        ext = video.group('ext')

        # We pick the first poster.  The text between 'posters' and '[' is
        # matched without a capture group so that the poster URL is the only
        # group the helper can return.
        # NOTE(review): assumes _search_regex returns the first matched group.
        video_thumbnail = self._search_regex(r'posters.*?\[\\x22(?P<thumb>.*?)\\x22',
            webpage, u'thumbnail', fatal=False)
        if video_thumbnail:
            video_thumbnail = video_thumbnail.replace('\\', '')

        # The only place where you can get a title, it's not complete,
        # but searching in other places doesn't work for all videos
        video_title = self._html_search_regex(r'<title>(?P<title>.*?)</title>',
            webpage, u'title', flags=re.DOTALL)

        return [{
            'id': video_id,
            'url': video_url,
            'title': video_title,
            'thumbnail': video_thumbnail,
            'ext': ext,
        }]
class BandcampIE(InfoExtractor):
    """Information Extractor for free tracks on bandcamp.com."""

    _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'

    def _real_extract(self, url):
        """Return a one-element list with the info dict of a free track.

        Only tracks offering a free download page are supported, and the
        'mp3-320' format is always chosen.  Raises ExtractorError whenever
        one of the intermediate pages lacks the expected data.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        title = mobj.group('title')
        webpage = self._download_webpage(url, title)

        # We get the link to the free download page
        m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
        if m_download is None:
            raise ExtractorError(u'No free songs found')
        download_link = m_download.group(1)

        m_id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
                         webpage, re.MULTILINE | re.DOTALL)
        if m_id is None:
            raise ExtractorError(u'Unable to extract track id')
        track_id = m_id.group('id')

        download_webpage = self._download_webpage(download_link, track_id,
                                                  'Downloading free downloads page')
        # We get the dictionary of the track from some javascript code
        m_items = re.search(r'items: (.*?),$', download_webpage, re.MULTILINE)
        if m_items is None:
            raise ExtractorError(u'Unable to extract track information')
        try:
            info = json.loads(m_items.group(1))[0]
        except (ValueError, IndexError, KeyError, TypeError):
            raise ExtractorError(u'Unable to parse track information')

        # We pick mp3-320 for now, until format selection can be easily implemented.
        try:
            mp3_info = info[u'downloads'][u'mp3-320']
            # If we try to use this url it says the link has expired
            initial_url = mp3_info[u'url']
        except (KeyError, TypeError):
            raise ExtractorError(u'No mp3-320 download available')

        re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
        m_url = re.match(re_url, initial_url)
        if m_url is None:
            raise ExtractorError(u'Unexpected download url: %s' % initial_url)

        # We build the url we will use to get the final track url
        # This url is built in Bandcamp in the script download_bunde_*.js
        request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (
            m_url.group('server'), m_url.group('fsig'), track_id, m_url.group('ts'))
        final_url_webpage = self._download_webpage(request_url, track_id, 'Requesting download url')
        # If we could correctly generate the .rand field the url would be
        # in the "download_url" key
        m_final = re.search(r'"retry_url":"(.*?)"', final_url_webpage)
        if m_final is None:
            raise ExtractorError(u'Unable to extract the final track url')
        final_url = m_final.group(1)

        try:
            track_info = {
                'id': track_id,
                'title': info[u'title'],
                'ext': 'mp3',
                'url': final_url,
                'thumbnail': info[u'thumb_url'],
                'uploader': info[u'artist'],
            }
        except KeyError as err:
            raise ExtractorError(u'Missing track field: %s' % err)

        return [track_info]
class RedTubeIE(InfoExtractor):
    """Information Extractor for redtube"""
    _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'

    def _real_extract(self,url):
        # The numeric id is the only thing taken from the URL itself.
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = url_match.group('id')

        page = self._download_webpage(url, video_id)
        self.report_extraction(video_id)

        # Both lookups are mandatory; the helper is called without fatal=False.
        media_url = self._html_search_regex(
            r'<source src="(.+?)" type="video/mp4">', page, u'video URL')
        heading = self._html_search_regex(
            '<h1 class="videoTitle slidePanelMovable">(.+?)</h1>', page, u'title')

        result = {
            'id': video_id,
            'url': media_url,
            'ext': 'mp4',
            'title': heading,
        }
        return [result]
class InaIE(InfoExtractor):
    """Information Extractor for Ina.fr"""
    _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'

    def _real_extract(self,url):
        """Return a one-element list with the info dict read from the MRSS feed.

        Raises ExtractorError when the URL does not match _VALID_URL.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        # The metadata is read from the player's MRSS feed, not the web page.
        mrss_url = 'http://player.ina.fr/notices/%s.mrss' % video_id
        video_extension = 'mp4'
        webpage = self._download_webpage(mrss_url, video_id)

        self.report_extraction(video_id)

        # Dots in the host name are escaped so they only match literal dots.
        video_url = self._html_search_regex(r'<media:player url="(?P<mp4url>http://mp4\.ina\.fr/[^"]+\.mp4)',
            webpage, u'video URL')

        video_title = self._search_regex(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
            webpage, u'title')

        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_extension,
            'title': video_title,
        }]
class HowcastIE(InfoExtractor):
    """Information Extractor for Howcast.com"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'

    def _real_extract(self, url):
        """Return a one-element list with the info dict of a Howcast video.

        Description and thumbnail are optional (fatal=False).  Raises
        ExtractorError when the URL does not match _VALID_URL.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        # Rebuild a canonical URL; the scheme is optional in _VALID_URL.
        webpage_url = 'http://www.howcast.com/videos/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)

        video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
            webpage, u'video URL')

        # The meta tags may use double or single quotes around the content.
        video_title = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
            webpage, u'title')

        video_description = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
            webpage, u'description', fatal=False)

        # Accept both quote styles here too, like the title and description.
        thumbnail = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:image\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'description': video_description,
            'thumbnail': thumbnail,
        }]
class VineIE(InfoExtractor):
    """Information Extractor for Vine.co"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'

    def _real_extract(self, url):
        """Return a one-element list with the info dict of a Vine video.

        Thumbnail and uploader are optional (fatal=False).  Raises
        ExtractorError when the URL does not match _VALID_URL.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        # Rebuild a canonical URL; the scheme is optional in _VALID_URL.
        webpage_url = 'https://vine.co/v/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)

        video_url = self._html_search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
            webpage, u'video URL')

        video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
            webpage, u'title')

        # The second group swallows an optional query string after the URL.
        thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"',
            webpage, u'thumbnail', fatal=False)

        uploader = self._html_search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
            webpage, u'uploader', fatal=False, flags=re.DOTALL)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'thumbnail': thumbnail,
            'uploader': uploader,
        }]
class FlickrIE(InfoExtractor):
    """Information Extractor for Flickr videos"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'

    def _real_extract(self, url):
        """Return a one-element list with the info dict of a Flickr video.

        The stream location is found through two XML requests: the first
        yields a node id, the second the playlist holding the stream.
        Raises ExtractorError when the URL is invalid or the stream is missing.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        video_uploader_id = mobj.group('uploader_id')
        webpage_url = 'http://www.flickr.com/photos/%s/%s' % (video_uploader_id, video_id)
        webpage = self._download_webpage(webpage_url, video_id)

        # the secret is sent along with both data requests below
        secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')

        first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
        first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')

        node_id = self._html_search_regex(r'<Item id="id">(\d+-\d+)</Item>',
            first_xml, u'node_id')

        second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
        second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')

        self.report_extraction(video_id)

        mobj = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video url')
        # FULLPATH is HTML-escaped in the playlist, APP is used verbatim
        video_url = mobj.group(1) + unescapeHTML(mobj.group(2))

        video_title = self._html_search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'video title')

        video_description = self._html_search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'description', fatal=False)

        thumbnail = self._html_search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'description': video_description,
            'thumbnail': thumbnail,
            'uploader_id': video_uploader_id,
        }]
class TeamcocoIE(InfoExtractor):
    _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'

    def _real_extract(self, url):
        # The URL only carries a slug; the numeric id comes from the page.
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        slug = url_match.group('url_title')
        page = self._download_webpage(url, slug)

        video_id = self._html_search_regex(
            r'<article class="video" data-id="(\d+?)"', page, u'video id')

        self.report_extraction(video_id)

        # Title is mandatory; thumbnail and description are optional.
        title = self._html_search_regex(
            r'<meta property="og:title" content="(.+?)"', page, u'title')
        thumb = self._html_search_regex(
            r'<meta property="og:image" content="(.+?)"',
            page, u'thumbnail', fatal=False)
        description = self._html_search_regex(
            r'<meta property="og:description" content="(.*?)"',
            page, u'description', fatal=False)

        # The media location is listed in a per-video XML document.
        xml_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
        xml_doc = self._download_webpage(xml_url, video_id, 'Downloading data webpage')
        media_url = self._html_search_regex(
            r'<file type="high".*?>(.*?)</file>', xml_doc, u'video URL')

        result = {
            'id': video_id,
            'url': media_url,
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumb,
            'description': description,
        }
        return [result]
class XHamsterIE(InfoExtractor):
    """Information Extractor for xHamster"""
    # The dot after 'www' is escaped so it only matches a literal dot.
    _VALID_URL = r'(?:http://)?(?:www\.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'

    def _real_extract(self,url):
        """Return a one-element list with the info dict of an xHamster video.

        Upload date and thumbnail are optional; the uploader id defaults to
        u'anonymous'.  No description is extracted because none is visible in
        the site's UI.  Raises ExtractorError when the URL is invalid or the
        media URL cannot be found.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
        webpage = self._download_webpage(mrss_url, video_id)

        mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract media URL')
        if not mobj.group('server'):
            # without a server, 'file' is itself a URL-quoted address
            video_url = compat_urllib_parse.unquote(mobj.group('file'))
        else:
            video_url = mobj.group('server') + '/key=' + mobj.group('file')
        video_extension = video_url.split('.')[-1]

        video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
            webpage, u'title')

        mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
        if mobj:
            # YYYYMMDD
            video_upload_date = mobj.group('upload_date_Y') + mobj.group('upload_date_m') + mobj.group('upload_date_d')
        else:
            video_upload_date = None
            self._downloader.report_warning(u'Unable to extract upload date')

        video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
            webpage, u'uploader id', default=u'anonymous')

        video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_extension,
            'title': video_title,
            'upload_date': video_upload_date,
            'uploader_id': video_uploader_id,
            'thumbnail': video_thumbnail,
        }]
class HypemIE(InfoExtractor):
    """Information Extractor for hypem"""
    _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'

    def _real_extract(self, url):
        """Return a one-element list with the info dict of a Hype Machine track.

        The track page embeds a JSON track list; its first entry provides the
        key needed to request the actual media URL.  The cookie received with
        the page is sent back with that second request.

        Raises ExtractorError when the URL is invalid or when either JSON
        document is malformed or lacks the expected fields.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        track_id = mobj.group(1)

        data = {'ax': 1, 'ts': time.time()}
        data_encoded = compat_urllib_parse.urlencode(data)
        complete_url = url + "?" + data_encoded
        request = compat_urllib_request.Request(complete_url)
        response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
        cookie = urlh.headers.get('Set-Cookie', '')

        self.report_extraction(track_id)

        html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
            response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
        try:
            track_list = json.loads(html_tracks)
            track = track_list[u'tracks'][0]
            key = track[u"key"]
            track_id = track[u"id"]
            artist = track[u"artist"]
            title = track[u"song"]
        except (ValueError, KeyError, IndexError, TypeError):
            # ValueError: not JSON; the others: JSON without the expected shape
            raise ExtractorError(u'Hypemachine contained invalid JSON.')

        serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
        # An empty body forces a POST; it must be bytes on Python 3.
        request = compat_urllib_request.Request(serve_url, b"", {'Content-Type': 'application/json'})
        request.add_header('cookie', cookie)
        song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
        try:
            song_data = json.loads(song_data_json)
            final_url = song_data[u"url"]
        except (ValueError, KeyError, TypeError):
            raise ExtractorError(u'Hypemachine contained invalid JSON.')

        return [{
            'id': track_id,
            'url': final_url,
            'ext': "mp3",
            'title': title,
            'artist': artist,
        }]
class Vbox7IE(InfoExtractor):
    """Information Extractor for Vbox7"""
    _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'

    def _real_extract(self,url):
        """Return a one-element list with the info dict of a Vbox7 video.

        The play page redirects through JavaScript; the media and thumbnail
        URLs come from a separate POST request answered with two
        '&'-separated 'name=value' fields.

        Raises ExtractorError when the URL is invalid or that answer does not
        have the expected form.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        redirect_page, urlh = self._download_webpage_handle(url, video_id)
        new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
        redirect_url = urlh.geturl() + new_location
        webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')

        # only the part of the page title before the first '/' is kept
        title = self._html_search_regex(r'<title>(.*)</title>',
            webpage, u'title').split('/')[0].strip()

        ext = "flv"
        info_url = "http://vbox7.com/play/magare.do"
        data = compat_urllib_parse.urlencode({'as3':'1','vid':video_id})
        if not isinstance(data, bytes):
            # Python 3: the POST body must be bytes; urlencode output is ASCII
            data = data.encode('ascii')
        info_request = compat_urllib_request.Request(info_url, data)
        info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
        info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
        if not info_response:
            raise ExtractorError(u'Unable to extract the media url')

        # Split each field on the first '=' only, so that values which
        # themselves contain '=' (e.g. URLs with a query) stay intact.
        fields = [field.split('=', 1) for field in info_response.split('&')]
        if len(fields) != 2 or any(len(field) != 2 for field in fields):
            raise ExtractorError(u'Unable to extract the media url')
        final_url, thumbnail_url = fields[0][1], fields[1][1]

        return [{
            'id': video_id,
            'url': final_url,
            'ext': ext,
            'title': title,
            'thumbnail': thumbnail_url,
        }]
class GametrailersIE(InfoExtractor):
    """Information Extractor for gametrailers.com"""
    # Dots in the host name are escaped so they only match literal dots.
    _VALID_URL = r'http://www\.gametrailers\.com/(?P<type>videos|reviews|full-episodes)/(?P<id>.*?)/(?P<title>.*)'

    def _real_extract(self, url):
        """Return the info dict of a Gametrailers video.

        The page yields an 'mgid', which is used to query two feeds: one for
        the metadata (title, description, thumbnail) and one for the media
        URLs.  The last listed media URL is used.

        Raises ExtractorError when the URL is invalid or when the metadata or
        the media URL cannot be found.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('id')
        video_type = mobj.group('type')
        webpage = self._download_webpage(url, video_id)
        # full episodes carry the mgid in a different attribute
        if video_type == 'full-episodes':
            mgid_re = r'data-video="(?P<mgid>mgid:.*?)"'
        else:
            mgid_re = r'data-contentId=\'(?P<mgid>mgid:.*?)\''
        mgid = self._search_regex(mgid_re, webpage, u'mgid')
        data = compat_urllib_parse.urlencode({'uri': mgid, 'acceptMethods': 'fms'})

        info_page = self._download_webpage('http://www.gametrailers.com/feeds/mrss?' + data,
                                           video_id, u'Downloading video info')
        links_webpage = self._download_webpage('http://www.gametrailers.com/feeds/mediagen/?' + data,
                                               video_id, u'Downloading video urls info')

        self.report_extraction(video_id)
        # re.VERBOSE: the whitespace inside this pattern is ignored
        info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]></title>.*
                      <description><!\[CDATA\[(?P<description>.*?)\]\]></description>.*
                      <image>.*
                        <url>(?P<thumb>.*?)</url>.*
                      </image>'''

        m_info = re.search(info_re, info_page, re.VERBOSE|re.DOTALL)
        if m_info is None:
            raise ExtractorError(u'Unable to extract video info')
        video_title = m_info.group('title')
        video_description = m_info.group('description')
        video_thumb = m_info.group('thumb')

        m_urls = list(re.finditer(r'<src>(?P<url>.*)</src>', links_webpage))
        if not m_urls:
            raise ExtractorError(u'Unable to extract video url')
        # They are sorted from worst to best quality
        video_url = m_urls[-1].group('url')

        return {
            'url': video_url,
            'id': video_id,
            'title': video_title,
            # Videos are actually flv not mp4
            'ext': 'flv',
            'thumbnail': video_thumb,
            'description': video_description,
        }
class StatigramIE(InfoExtractor):
    """Information Extractor for statigr.am"""
    _VALID_URL = r'(?:http://)?(?:www\.)?statigr\.am/p/([^/]+)'

    def _real_extract(self, url):
        """Return a one-element list with the info dict of a Statigram video.

        The title is the page title without its ' | Statigram' suffix; the
        uploader id is the first '@name' found in that title.  Raises
        ExtractorError when the URL does not match _VALID_URL.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group(1)
        webpage = self._download_webpage(url, video_id)
        video_url = self._html_search_regex(
            r'<meta property="og:video:secure_url" content="(.+?)">',
            webpage, u'video URL')
        thumbnail_url = self._html_search_regex(
            r'<meta property="og:image" content="(.+?)" />',
            webpage, u'thumbnail URL', fatal=False)
        html_title = self._html_search_regex(
            r'<title>(.+?)</title>',
            webpage, u'title')
        # rpartition yields an empty head when the suffix is absent; keep the
        # whole page title in that case instead of an empty title
        title, suffix, _ = html_title.rpartition(u' | Statigram')
        if not suffix:
            title = html_title
        uploader_id = self._html_search_regex(
            r'@([^ ]+)', title, u'uploader name', fatal=False)
        ext = 'mp4'

        return [{
            'id': video_id,
            'url': video_url,
            'ext': ext,
            'title': title,
            'thumbnail': thumbnail_url,
            'uploader_id': uploader_id,
        }]
3655 def gen_extractors():
3656 """ Return a list of an instance of every supported extractor.
3657 The order does matter; the first extractor matched is the one handling the URL.
3660 YoutubePlaylistIE(),
3685 StanfordOpenClassroomIE(),
3695 WorldStarHipHopIE(),
def get_info_extractor(ie_name):
    """Look up the extractor class named ``<ie_name>IE`` in this module.

    A KeyError propagates when the module defines no such name.
    """
    module_namespace = globals()
    class_name = ie_name + 'IE'
    return module_namespace[class_name]