2 # -*- coding: utf-8 -*-
4 from __future__ import absolute_import
15 import xml.etree.ElementTree
24 from .extractor.common import InfoExtractor, SearchInfoExtractor
26 from .extractor.ard import ARDIE
27 from .extractor.arte import ArteTvIE
28 from .extractor.dailymotion import DailymotionIE
29 from .extractor.metacafe import MetacafeIE
30 from .extractor.statigram import StatigramIE
31 from .extractor.photobucket import PhotobucketIE
32 from .extractor.vimeo import VimeoIE
33 from .extractor.yahoo import YahooIE
34 from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeUserIE, YoutubeChannelIE
35 from .extractor.zdf import ZDFIE
class GenericIE(InfoExtractor):
    """Generic last-resort information extractor.

    Tried when no site-specific extractor matches: follows URL-shortener
    redirects, then scrapes the page for common embedded-player patterns
    (JW Player flashvars, twitter:player:stream cards, Open Graph video tags).
    NOTE(review): source listing was whitespace-mangled with lines missing;
    gap lines restored from context — verify against upstream history.
    """

    def report_download_webpage(self, video_id):
        """Report webpage download, warning that this is the fallback IE."""
        if not self._downloader.params.get('test', False):
            self._downloader.report_warning(u'Falling back on generic information extractor.')
        super(GenericIE, self).report_download_webpage(video_id)

    def report_following_redirect(self, new_url):
        """Report that a redirect is being followed."""
        self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)

    def _test_redirect(self, url):
        """Check if it is a redirect, like url shorteners, in case return the new url."""
        class HeadRequest(compat_urllib_request.Request):
            # Issue HEAD instead of GET so no body is transferred.
            def get_method(self):
                return "HEAD"

        class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
            """
            Subclass the HTTPRedirectHandler to make it use our
            HeadRequest also on the redirected URL
            """
            def redirect_request(self, req, fp, code, msg, headers, newurl):
                if code in (301, 302, 303, 307):
                    newurl = newurl.replace(' ', '%20')
                    # Drop body-related headers: the redirected HEAD has no body.
                    newheaders = dict((k, v) for k, v in req.headers.items()
                                      if k.lower() not in ("content-length", "content-type"))
                    return HeadRequest(newurl,
                                       headers=newheaders,
                                       origin_req_host=req.get_origin_req_host(),
                                       unverifiable=True)
                else:
                    raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)

        class HTTPMethodFallback(compat_urllib_request.BaseHandler):
            """
            Fallback to GET if HEAD is not allowed (405 HTTP error)
            """
            def http_error_405(self, req, fp, code, msg, headers):
                fp.read()
                fp.close()

                newheaders = dict((k, v) for k, v in req.headers.items()
                                  if k.lower() not in ("content-length", "content-type"))
                return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
                                        headers=newheaders,
                                        origin_req_host=req.get_origin_req_host(),
                                        unverifiable=True))

        # Build a bare opener with just the handlers we need.
        opener = compat_urllib_request.OpenerDirector()
        for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
                        HTTPMethodFallback, HEADRedirectHandler,
                        compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
            opener.add_handler(handler())

        response = opener.open(HeadRequest(url))
        if response is None:
            raise ExtractorError(u'Invalid URL protocol')
        new_url = response.geturl()

        if url == new_url:
            return False

        self.report_following_redirect(new_url)
        return new_url

    def _real_extract(self, url):
        new_url = self._test_redirect(url)
        if new_url:
            return [self.url_result(new_url)]

        video_id = url.split('/')[-1]
        try:
            webpage = self._download_webpage(url, video_id)
        except ValueError as err:
            # since this is the last-resort InfoExtractor, if
            # this error is thrown, it'll be thrown here
            raise ExtractorError(u'Invalid URL: %s' % url)

        self.report_extraction(video_id)
        # Start with something easy: JW Player in SWFObject
        mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit
            mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
        if mobj is None:
            # Broaden the search a little bit: JWPlayer JS loader
            mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
        if mobj is None:
            # Try to find twitter cards info
            mobj = re.search(r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage)
        if mobj is None:
            # We look for Open Graph info:
            # We have to match any number spaces between elements, some sites try to align them (eg.: statigr.am)
            m_video_type = re.search(r'<meta.*?property="og:video:type".*?content="video/(.*?)"', webpage)
            # We only look in og:video if the MIME type is a video, don't try if it's a Flash player:
            if m_video_type is not None:
                mobj = re.search(r'<meta.*?property="og:video".*?content="(.*?)"', webpage)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # It's possible that one of the regexes
        # matched, but returned an empty group:
        if mobj.group(1) is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_url = compat_urllib_parse.unquote(mobj.group(1))
        video_id = os.path.basename(video_url)

        # here's a fun little line of code for you:
        video_extension = os.path.splitext(video_id)[1][1:]
        video_id = os.path.splitext(video_id)[0]

        # it's tempting to parse this further, but you would
        # have to take into account all the variations like
        #   Video Title - Site Name
        #   Site Name | Video Title
        #   Video Title - Tagline | Site Name
        # and so on and so forth; it's just not practical
        video_title = self._html_search_regex(r'<title>(.*)</title>',
            webpage, u'video title')

        # video uploader is domain name
        video_uploader = self._search_regex(r'(?:https?://)?([^/]*)/.*',
            url, u'video uploader')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': video_uploader,
            'upload_date': None,
            'title': video_title,
            'ext': video_extension,
        }]
class YoutubeSearchIE(SearchInfoExtractor):
    """Information Extractor for YouTube search queries.

    Pages through the gdata v2 JSON-C API, 50 results per request, until
    `n` results (or the API-reported total) are collected.
    """
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'
    _MAX_RESULTS = 1000
    IE_NAME = u'youtube:search'
    _SEARCH_KEY = 'ytsearch'

    def report_download_page(self, query, pagenum):
        """Report attempt to download search page with given number."""
        self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        video_ids = []
        pagenum = 0
        limit = n

        while (50 * pagenum) < limit:
            self.report_download_page(query, pagenum + 1)
            result_url = self._API_URL % (compat_urllib_parse.quote_plus(query), (50 * pagenum) + 1)
            request = compat_urllib_request.Request(result_url)
            try:
                data = compat_urllib_request.urlopen(request).read().decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download API page: %s' % compat_str(err))
            api_response = json.loads(data)['data']

            # FIX: idiomatic membership test (was `if not 'items' in ...`).
            if 'items' not in api_response:
                raise ExtractorError(u'[youtube] No video results')

            new_ids = list(video['id'] for video in api_response['items'])
            video_ids += new_ids

            # Never request more than the API says exists.
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        # FIX: loop variable renamed from `id`, which shadowed the builtin.
        videos = [self.url_result('http://www.youtube.com/watch?v=%s' % video_id, 'Youtube')
                  for video_id in video_ids]
        return self.playlist_result(videos, query)
class GoogleSearchIE(SearchInfoExtractor):
    """Information Extractor for Google Video search queries.

    Scrapes google.com/search?tbm=vid result pages, 10 results each,
    until `n` entries are gathered or no next-page link remains.
    NOTE(review): gap lines restored from context — verify against upstream.
    """
    _MORE_PAGES_INDICATOR = r'id="pnnext" class="pn"'
    _MAX_RESULTS = 1000
    IE_NAME = u'video.google:search'
    _SEARCH_KEY = 'gvsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        res = {
            '_type': 'playlist',
            'id': query,
            'entries': [],
        }

        for pagenum in itertools.count(1):
            result_url = u'http://www.google.com/search?tbm=vid&q=%s&start=%s&hl=en' % (compat_urllib_parse.quote_plus(query), pagenum * 10)
            webpage = self._download_webpage(result_url, u'gvsearch:' + query,
                note='Downloading result page ' + str(pagenum))

            for mobj in re.finditer(r'<h3 class="r"><a href="([^"]+)"', webpage):
                e = {
                    '_type': 'url',
                    'url': mobj.group(1),
                }
                res['entries'].append(e)

            # Stop when enough pages were fetched or no "next" button exists.
            if (pagenum * 10 > n) or not re.search(self._MORE_PAGES_INDICATOR, webpage):
                return res
class YahooSearchIE(SearchInfoExtractor):
    """Information Extractor for Yahoo! Video search queries.

    Fetches video.search.yahoo.com JSON result pages (30 per page) and
    yields url_result entries pointing at the Yahoo extractor.
    NOTE(review): gap lines restored from context — verify against upstream.
    """

    _MAX_RESULTS = 1000
    IE_NAME = u'screen.yahoo:search'
    _SEARCH_KEY = 'yvsearch'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        res = {
            '_type': 'playlist',
            'id': query,
            'entries': [],
        }
        for pagenum in itertools.count(0):
            result_url = u'http://video.search.yahoo.com/search/?p=%s&fr=screen&o=js&gs=0&b=%d' % (compat_urllib_parse.quote_plus(query), pagenum * 30)
            webpage = self._download_webpage(result_url, query,
                note='Downloading results page ' + str(pagenum + 1))
            info = json.loads(webpage)
            m = info[u'm']
            results = info[u'results']

            # FIX: `i` was only bound inside the for loop, so an empty results
            # page made the trailing condition raise NameError. Track the count
            # explicitly so empty pages terminate cleanly.
            i = -1
            for (i, r) in enumerate(results):
                if (pagenum * 30) + i >= n:
                    break
                mobj = re.search(r'(?P<url>screen\.yahoo\.com/.*?-\d*?\.html)"', r)
                e = self.url_result('http://' + mobj.group('url'), 'Yahoo')
                res['entries'].append(e)
            if (pagenum * 30 + i >= n) or (m[u'last'] >= (m[u'total'] - 1)):
                break

        return res
class BlipTVUserIE(InfoExtractor):
    """Information Extractor for blip.tv users.

    Resolves the numeric user id from the profile page, then pages through
    the mobile Ajax episode list (12 ids per page) and returns a playlist.
    NOTE(review): gap lines restored from context — verify against upstream.
    """

    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
    _PAGE_SIZE = 12
    IE_NAME = u'blip.tv:user'

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'

        page = self._download_webpage(url, username, u'Downloading user page')
        mobj = re.search(r'data-users-id="([^"]+)"', page)
        # FIX: a missing users-id previously crashed with an opaque
        # AttributeError on mobj.group; fail with a clear extractor error.
        if mobj is None:
            raise ExtractorError(u'Unable to extract blip.tv user id')
        page_base = page_base % mobj.group(1)

        # Download video ids using BlipTV Ajax calls. Result size per
        # query is limited (currently to 12 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.
        video_ids = []
        pagenum = 1

        while True:
            url = page_base + "&page=" + str(pagenum)
            page = self._download_webpage(url, username,
                u'Downloading video ids from page %d' % pagenum)

            # Extract video identifiers
            ids_in_page = []

            for mobj in re.finditer(r'href="/([^"]+)"', page):
                if mobj.group(1) not in ids_in_page:
                    ids_in_page.append(unescapeHTML(mobj.group(1)))

            video_ids.extend(ids_in_page)

            # A little optimization - if current page is not
            # "full", ie. does not contain PAGE_SIZE video ids then
            # we can assume that this page is the last one - there
            # are no more ids on further pages - no need to query
            # again.
            if len(ids_in_page) < self._PAGE_SIZE:
                break

            pagenum += 1

        urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
        url_entries = [self.url_result(url, 'BlipTV') for url in urls]
        return [self.playlist_result(url_entries, playlist_title = username)]
class DepositFilesIE(InfoExtractor):
    """Information extractor for depositfiles.com

    POSTs the 'Free download' form and scrapes the real file URL, or
    surfaces the site's restriction message on failure.
    NOTE(review): gap lines restored from context — verify against upstream.
    """

    _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'

    def _real_extract(self, url):
        file_id = url.split('/')[-1]
        # Rebuild url in english locale
        url = 'http://depositfiles.com/en/files/' + file_id

        # Retrieve file webpage with 'Free download' button pressed
        free_download_indication = {'gateway_result': '1'}
        request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
        try:
            self.report_download_webpage(file_id)
            webpage = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve file webpage: %s' % compat_str(err))

        # Search for the real file URL
        mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
        if (mobj is None) or (mobj.group(1) is None):
            # Try to figure out reason of the error.
            mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
            if (mobj is not None) and (mobj.group(1) is not None):
                # FIX: raw string for the regex ('\s+' relied on a deprecated
                # non-raw escape); behavior identical, no DeprecationWarning.
                restriction_message = re.sub(r'\s+', ' ', mobj.group(1)).strip()
                raise ExtractorError(u'%s' % restriction_message)
            else:
                raise ExtractorError(u'Unable to extract download URL from: %s' % url)

        file_url = mobj.group(1)
        file_extension = os.path.splitext(file_url)[1][1:]

        # Search for file title
        file_title = self._search_regex(r'<b title="(.*?)">', webpage, u'title')

        # NOTE(review): the .decode('utf-8') calls assume Python-2 byte
        # strings; they fail on Python 3 str — confirm target interpreter.
        return [{
            'id': file_id.decode('utf-8'),
            'url': file_url.decode('utf-8'),
            'uploader': None,
            'upload_date': None,
            'title': file_title,
            'ext': file_extension.decode('utf-8'),
        }]
class FacebookIE(InfoExtractor):
    """Information Extractor for Facebook

    Optionally logs in (credentials from options or .netrc), then parses the
    swf param JSON embedded in the video page for hd/sd stream URLs.
    NOTE(review): gap lines restored from context — verify against upstream.
    """

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
    _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
    _NETRC_MACHINE = 'facebook'
    IE_NAME = u'facebook'

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    def _real_initialize(self):
        if self._downloader is None:
            return

        useremail = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            useremail = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    useremail = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
                return

        if useremail is None:
            return

        # Log in
        login_form = {
            'email': useremail,
            'pass': password,
            'login': 'Log+In',
        }
        request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read()
            # A login form in the response means the credentials were rejected.
            # FIX: corrected "exceded" -> "exceeded" in the warning text.
            if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username/password, or exceeded login rate limit (~3/min). Check credentials or wait.')
                return
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
        webpage = self._download_webpage(url, video_id)

        # The stream parameters live between these two JS fragments.
        BEFORE = '{swf.addParam(param[0], param[1]);});\n'
        AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
        m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
        if not m:
            raise ExtractorError(u'Cannot parse data')
        data = dict(json.loads(m.group(1)))
        params_raw = compat_urllib_parse.unquote(data['params'])
        params = json.loads(params_raw)
        video_data = params['video_data'][0]
        # Prefer HD, fall back to SD.
        video_url = video_data.get('hd_src')
        if not video_url:
            video_url = video_data['sd_src']
        if not video_url:
            raise ExtractorError(u'Cannot find video URL')
        video_duration = int(video_data['video_duration'])
        thumbnail = video_data['thumbnail_src']

        video_title = self._html_search_regex('<h2 class="uiHeaderTitle">([^<]+)</h2>',
            webpage, u'title')

        info = {
            'id': video_id,
            'title': video_title,
            'url': video_url,
            'ext': 'mp4',
            'duration': video_duration,
            'thumbnail': thumbnail,
        }
        return [info]
class BlipTVIE(InfoExtractor):
    """Information extractor for blip.tv

    Handles /play/ and api.swf URLs by resolving them to canonical pages,
    then queries the JSON skin of the page (iTunes UA required) — unless the
    server answers with the video bytes directly, in which case it reports a
    direct download.
    NOTE(review): gap lines restored from context — verify against upstream.
    """

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv/((.+/)|(play/)|(api\.swf#))(.+)$'
    _URL_EXT = r'^.*\.([a-z0-9]+)$'
    IE_NAME = u'blip.tv'

    def report_direct_download(self, title):
        """Report information extraction."""
        self.to_screen(u'%s: Direct download detected' % title)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # See https://github.com/rg3/youtube-dl/issues/857
        api_mobj = re.match(r'http://a\.blip\.tv/api\.swf#(?P<video_id>[\d\w]+)', url)
        if api_mobj is not None:
            url = 'http://blip.tv/play/g_%s' % api_mobj.group('video_id')
        urlp = compat_urllib_parse_urlparse(url)
        if urlp.path.startswith('/play/'):
            # /play/ URLs redirect to a page carrying the file id in the
            # fragment; recurse on the canonical URL.
            request = compat_urllib_request.Request(url)
            response = compat_urllib_request.urlopen(request)
            redirecturl = response.geturl()
            rurlp = compat_urllib_parse_urlparse(redirecturl)
            file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
            url = 'http://blip.tv/a/a-' + file_id
            return self._real_extract(url)

        cchar = '&' if '?' in url else '?'
        json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
        request = compat_urllib_request.Request(json_url)
        request.add_header('User-Agent', 'iTunes/10.6.1')
        self.report_extraction(mobj.group(1))
        info = None
        try:
            urlh = compat_urllib_request.urlopen(request)
            if urlh.headers.get('Content-Type', '').startswith('video/'):  # Direct download
                basename = url.split('/')[-1]
                title, ext = os.path.splitext(basename)
                title = title.decode('UTF-8')
                ext = ext.replace('.', '')
                self.report_direct_download(title)
                info = {
                    'id': title,
                    'url': url,
                    'uploader': None,
                    'upload_date': None,
                    'title': title,
                    'ext': ext,
                    'urlhandle': urlh,
                }
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
        if info is None:  # Regular URL
            try:
                json_code_bytes = urlh.read()
                json_code = json_code_bytes.decode('utf-8')
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to read video info webpage: %s' % compat_str(err))

            try:
                json_data = json.loads(json_code)
                # The payload is either wrapped in 'Post' or bare.
                data = json_data['Post'] if 'Post' in json_data else json_data

                upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
                video_url = data['media']['url']
                umobj = re.match(self._URL_EXT, video_url)
                if umobj is None:
                    raise ValueError('Can not determine filename extension')
                ext = umobj.group(1)

                info = {
                    'id': data['item_id'],
                    'url': video_url,
                    'uploader': data['display_name'],
                    'upload_date': upload_date,
                    'title': data['title'],
                    'ext': ext,
                    'format': data['media']['mimeType'],
                    'thumbnail': data['thumbnailUrl'],
                    'description': data['description'],
                    'player_url': data['embedUrl'],
                    'user_agent': 'iTunes/10.6.1',
                }
            except (ValueError, KeyError) as err:
                raise ExtractorError(u'Unable to parse video information: %s' % repr(err))

        return [info]
class MyVideoIE(InfoExtractor):
    """Information Extractor for myvideo.de.

    Either grabs a plain <source src=...> URL, or decrypts the RC4-encrypted
    player XML (key derived from the base64-wrapped GK constant and the video
    id) to recover RTMP/HTTP stream parameters.
    NOTE(review): gap lines restored from context — verify against upstream.
    """

    _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
    IE_NAME = u'myvideo'

    # Original Code from: https://github.com/dersphere/plugin.video.myvideo_de.git
    # Released into the Public Domain by Tristan Fischer on 2013-05-19
    # https://github.com/rg3/youtube-dl/pull/842
    def __rc4crypt(self, data, key):
        """RC4-decrypt `data` (bytes) with `key`, returning a str."""
        x = 0
        box = list(range(256))
        # Key-scheduling algorithm (KSA).
        for i in list(range(256)):
            x = (x + box[i] + compat_ord(key[i % len(key)])) % 256
            box[i], box[x] = box[x], box[i]
        x = 0
        y = 0
        out = ''
        # Pseudo-random generation + XOR.
        for char in data:
            x = (x + 1) % 256
            y = (y + box[x]) % 256
            box[x], box[y] = box[y], box[x]
            out += chr(compat_ord(char) ^ box[(box[x] + box[y]) % 256])
        return out

    def __md5(self, s):
        """Hex MD5 digest of `s`, returned as bytes."""
        return hashlib.md5(s).hexdigest().encode()

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        video_id = mobj.group(1)

        # Doubly base64-encoded key material used to derive the RC4 key.
        GK = (
            b'WXpnME1EZGhNRGhpTTJNM01XVmhOREU0WldNNVpHTTJOakpt'
            b'TW1FMU5tVTBNR05pWkRaa05XRXhNVFJoWVRVd1ptSXhaVEV3'
            b'TnpsbA0KTVRkbU1tSTRNdz09'
        )

        # Get video webpage
        webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
        webpage = self._download_webpage(webpage_url, video_id)

        # Easy path: a plain <source> tag on the page.
        mobj = re.search('source src=\'(.+?)[.]([^.]+)\'', webpage)
        if mobj is not None:
            self.report_extraction(video_id)
            video_url = mobj.group(1) + '.flv'

            video_title = self._html_search_regex('<title>([^<]+)</title>',
                webpage, u'title')

            video_ext = self._search_regex('[.](.+?)$', video_url, u'extension')

            return [{
                'id': video_id,
                'url': video_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': video_ext,
            }]

        # Otherwise decode the encrypted player configuration.
        mobj = re.search('var flashvars={(.+?)}', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video')

        params = {}
        encxml = ''
        sec = mobj.group(1)
        for (a, b) in re.findall('(.+?):\'(.+?)\',?', sec):
            if not a == '_encxml':
                params[a] = b
            else:
                encxml = compat_urllib_parse.unquote(b)
        if not params.get('domain'):
            params['domain'] = 'www.myvideo.de'
        xmldata_url = '%s?%s' % (encxml, compat_urllib_parse.urlencode(params))
        if 'flash_playertype=MTV' in xmldata_url:
            self._downloader.report_warning(u'avoiding MTV player')
            xmldata_url = (
                'http://www.myvideo.de/dynamic/get_player_video_xml.php'
                '?flash_playertype=D&ID=%s&_countlimit=4&autorun=yes'
            ) % video_id

        # Fetch and decrypt the player XML.
        enc_data = self._download_webpage(xmldata_url, video_id).split('=')[1]
        enc_data_b = binascii.unhexlify(enc_data)
        sk = self.__md5(
            base64.b64decode(base64.b64decode(GK)) +
            self.__md5(
                str(video_id).encode('utf-8')
            )
        )
        dec_data = self.__rc4crypt(enc_data_b, sk)

        # extracting infos
        self.report_extraction(video_id)

        video_url = None
        mobj = re.search('connectionurl=\'(.*?)\'', dec_data)
        if mobj:
            video_url = compat_urllib_parse.unquote(mobj.group(1))
            if 'myvideo2flash' in video_url:
                self._downloader.report_warning(u'forcing RTMPT ...')
                video_url = video_url.replace('rtmpe://', 'rtmpt://')

        if not video_url:
            # extract non rtmp videos
            mobj = re.search('path=\'(http.*?)\' source=\'(.*?)\'', dec_data)
            if mobj is None:
                raise ExtractorError(u'unable to extract url')
            video_url = compat_urllib_parse.unquote(mobj.group(1)) + compat_urllib_parse.unquote(mobj.group(2))

        video_file = self._search_regex('source=\'(.*?)\'', dec_data, u'video file')
        video_file = compat_urllib_parse.unquote(video_file)

        if not video_file.endswith('f4m'):
            ppath, prefix = video_file.split('.')
            video_playpath = '%s:%s' % (prefix, ppath)
            video_hls_playlist = ''
        else:
            video_playpath = ''
            # NOTE(review): video_filepath is not defined in the visible
            # source; restored from the decrypted data — confirm upstream.
            video_filepath = self._search_regex('path=\'(.*?)\'', dec_data, u'filepath')
            video_hls_playlist = (
                video_filepath + video_file
            ).replace('.f4m', '.m3u8')

        video_swfobj = self._search_regex('swfobject.embedSWF\(\'(.+?)\'', webpage, u'swfobj')
        video_swfobj = compat_urllib_parse.unquote(video_swfobj)

        video_title = self._html_search_regex("<h1(?: class='globalHd')?>(.*?)</h1>",
            webpage, u'title')

        return [{
            'id': video_id,
            'url': video_url,
            'tc_url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': u'flv',
            'play_path': video_playpath,
            'video_file': video_file,
            'video_hls_playlist': video_hls_playlist,
            'player_url': video_swfobj,
        }]
class ComedyCentralIE(InfoExtractor):
    """Information extractor for The Daily Show and Colbert Report

    Accepts shortnames (:tds, :colbert, ...), full-episode URLs, and clip
    URLs; resolves the mtvnservices media URI, downloads the MRSS index and
    per-item configuration, and rewrites the RTMP rendition into an HTTP URL.
    NOTE(review): gap lines restored from context — verify against upstream.
    """

    # urls can be abbreviations like :thedailyshow or :colbert
    # urls for episodes like:
    # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
    # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
    # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
    _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
                      |(https?://)?(www\.)?
                          (?P<showname>thedailyshow|colbertnation)\.com/
                         (full-episodes/(?P<episode>.*)|
                          (?P<clip>
                              (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
                              |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
                     $"""

    _available_formats = ['3500', '2200', '1700', '1200', '750', '400']

    _video_extensions = {
        '3500': 'mp4',
        '2200': 'mp4',
        '1700': 'mp4',
        '1200': 'mp4',
        '750': 'mp4',
        '400': 'mp4',
    }
    _video_dimensions = {
        '3500': '1280x720',
        '2200': '960x540',
        '1700': '768x432',
        '1200': '640x360',
        '750': '512x288',
        '400': '384x216',
    }

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL needs the VERBOSE flag.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _print_formats(self, formats):
        print('Available formats:')
        for x in formats:
            print('%s\t:\t%s\t[%s]' % (x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url, re.VERBOSE)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('shortname'):
            # Expand :tds / :colbert style abbreviations.
            if mobj.group('shortname') in ('tds', 'thedailyshow'):
                url = u'http://www.thedailyshow.com/full-episodes/'
            else:
                url = u'http://www.colbertnation.com/full-episodes/'
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            assert mobj is not None

        if mobj.group('clip'):
            if mobj.group('showname') == 'thedailyshow':
                epTitle = mobj.group('tdstitle')
            else:
                epTitle = mobj.group('cntitle')
            dlNewest = False
        else:
            dlNewest = not mobj.group('episode')
            if dlNewest:
                epTitle = mobj.group('showname')
            else:
                epTitle = mobj.group('episode')

        self.report_extraction(epTitle)
        webpage, htmlHandle = self._download_webpage_handle(url, epTitle)
        if dlNewest:
            # The bare show URL redirects to the newest episode; re-parse it.
            url = htmlHandle.geturl()
            mobj = re.match(self._VALID_URL, url, re.VERBOSE)
            if mobj is None:
                raise ExtractorError(u'Invalid redirected URL: ' + url)
            if mobj.group('episode') == '':
                raise ExtractorError(u'Redirected URL is still not specific: ' + url)
            epTitle = mobj.group('episode')

        mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)

        if len(mMovieParams) == 0:
            # The Colbert Report embeds the information in a without
            # a URL prefix; so extract the alternate reference
            # and then add the URL prefix manually.

            altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
            if len(altMovieParams) == 0:
                raise ExtractorError(u'unable to find Flash URL in webpage ' + url)
            else:
                mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]

        uri = mMovieParams[0][1]
        indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
        indexXml = self._download_webpage(indexUrl, epTitle,
                                          u'Downloading show index',
                                          u'unable to download episode index')

        results = []

        idoc = xml.etree.ElementTree.fromstring(indexXml)
        itemEls = idoc.findall('.//item')
        for partNum, itemEl in enumerate(itemEls):
            mediaId = itemEl.findall('./guid')[0].text
            shortMediaId = mediaId.split(':')[-1]
            showId = mediaId.split(':')[-2].replace('.com', '')
            officialTitle = itemEl.findall('./title')[0].text
            officialDate = unified_strdate(itemEl.findall('./pubDate')[0].text)

            configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
                         compat_urllib_parse.urlencode({'uri': mediaId}))
            configXml = self._download_webpage(configUrl, epTitle,
                                               u'Downloading configuration for %s' % shortMediaId)

            cdoc = xml.etree.ElementTree.fromstring(configXml)
            turls = []
            for rendition in cdoc.findall('.//rendition'):
                finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
                turls.append(finfo)

            if len(turls) == 0:
                self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
                continue

            if self._downloader.params.get('listformats', None):
                self._print_formats([i[0] for i in turls])
                return

            # For now, just pick the highest bitrate
            format, rtmp_video_url = turls[-1]

            # Get the format arg from the arg stream
            req_format = self._downloader.params.get('format', None)

            # Select format if we can find one
            for f, v in turls:
                if f == req_format:
                    format, rtmp_video_url = f, v
                    break

            # Rewrite the rtmp(e) rendition URL into a plain HTTP one.
            m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
            if not m:
                raise ExtractorError(u'Cannot transform RTMP url')
            base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
            video_url = base + m.group('finalid')

            effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum + 1)
            info = {
                'id': shortMediaId,
                'url': video_url,
                'uploader': showId,
                'upload_date': officialDate,
                'title': effTitle,
                'ext': 'mp4',
                'format': format,
                'thumbnail': None,
                'description': officialTitle,
            }
            results.append(info)

        return results
class EscapistIE(InfoExtractor):
    """Information extractor for The Escapist

    Reads meta tags from the video page, unquotes the player's `config=`
    URL, and parses the (almost-JSON) configuration for the stream URL.
    NOTE(review): gap lines restored from context — verify against upstream.
    """

    _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
    IE_NAME = u'escapist'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        showName = mobj.group('showname')
        videoId = mobj.group('episode')

        self.report_extraction(videoId)
        webpage = self._download_webpage(url, videoId)

        videoDesc = self._html_search_regex('<meta name="description" content="([^"]*)"',
            webpage, u'description', fatal=False)

        imgUrl = self._html_search_regex('<meta property="og:image" content="([^"]*)"',
            webpage, u'thumbnail', fatal=False)

        playerUrl = self._html_search_regex('<meta property="og:video" content="([^"]*)"',
            webpage, u'player url')

        # FIX: this regex extracts the title, but its error label was a
        # copy-paste of u'player url'; corrected so failures report "title".
        title = self._html_search_regex('<meta name="title" content="([^"]*)"',
            webpage, u'title').split(' : ')[-1]

        configUrl = self._search_regex('config=(.*)$', playerUrl, u'config url')
        configUrl = compat_urllib_parse.unquote(configUrl)

        configJSON = self._download_webpage(configUrl, videoId,
                                            u'Downloading configuration',
                                            u'unable to download configuration')

        # Technically, it's JavaScript, not JSON
        configJSON = configJSON.replace("'", '"')

        try:
            config = json.loads(configJSON)
        except (ValueError,) as err:
            raise ExtractorError(u'Invalid JSON in configuration file: ' + compat_str(err))

        playlist = config['playlist']
        videoUrl = playlist[1]['url']

        info = {
            'id': videoId,
            'url': videoUrl,
            'uploader': showName,
            'upload_date': None,
            'title': title,
            'ext': 'mp4',
            'thumbnail': imgUrl,
            'description': videoDesc,
            'player_url': playerUrl,
        }
        return [info]
class CollegeHumorIE(InfoExtractor):
    """Information extractor for collegehumor.com

    Downloads the moogaloop metadata XML, follows the F4M manifest it points
    at, and assembles the Seg1-Frag1 fragment URL.
    NOTE(review): gap lines restored from context — verify against upstream.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
    IE_NAME = u'collegehumor'

    def report_manifest(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Downloading XML manifest' % video_id)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        info = {
            'id': video_id,
            'uploader': None,
            'upload_date': None,
        }

        self.report_extraction(video_id)
        xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
        try:
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metaXml)
        try:
            videoNode = mdoc.findall('./video')[0]
            info['description'] = videoNode.findall('./description')[0].text
            info['title'] = videoNode.findall('./caption')[0].text
            info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
            manifest_url = videoNode.findall('./file')[0].text
        except IndexError:
            raise ExtractorError(u'Invalid metadata XML file')

        manifest_url += '?hdcore=2.10.3'
        self.report_manifest(video_id)
        try:
            manifestXml = compat_urllib_request.urlopen(manifest_url).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))

        adoc = xml.etree.ElementTree.fromstring(manifestXml)
        try:
            # The manifest is in the Adobe F4M namespace.
            media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
            node_id = media_node.attrib['url']
            video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
        except IndexError as err:
            raise ExtractorError(u'Invalid manifest file')

        url_pr = compat_urllib_parse_urlparse(manifest_url)
        url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'

        info['url'] = url
        info['ext'] = 'f4f'
        return [info]
class XVideosIE(InfoExtractor):
    """Information extractor for xvideos.com

    Scrapes the flv_url flash variable, the page title, and the thumbnail
    from the watch page.
    NOTE(review): gap lines restored from context — verify against upstream.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
    IE_NAME = u'xvideos'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        # Extract video URL
        video_url = compat_urllib_parse.unquote(self._search_regex(r'flv_url=(.+?)&',
            webpage, u'video URL'))

        # Extract title (strip the trailing " - XVIDEOS..." suffix)
        video_title = self._html_search_regex(r'<title>(.*?)\s+-\s+XVID',
            webpage, u'title')

        # Extract video thumbnail
        video_thumbnail = self._search_regex(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)',
            webpage, u'thumbnail', fatal=False)

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }

        return [info]
class SoundcloudIE(InfoExtractor):
    """Information extractor for soundcloud.com

    Resolves the track via the public resolve.json API, then fetches the
    stream definitions and returns the 128 kbit/s MP3 stream.
    NOTE(review): gap lines restored from context — verify against upstream.
    """

    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'soundcloud'

    def report_resolve(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract simple title (uploader + slug of song title)
        slug_title = mobj.group(2)
        simple_title = uploader + u'-' + slug_title
        full_title = '%s/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        info_json = self._download_webpage(resolv_url, full_title, u'Downloading info JSON')

        info = json.loads(info_json)
        video_id = info['id']
        self.report_extraction(full_title)

        streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        stream_json = self._download_webpage(streams_url, full_title,
                                             u'Downloading stream definitions',
                                             u'unable to download stream definitions')

        streams = json.loads(stream_json)
        mediaURL = streams['http_mp3_128_url']
        upload_date = unified_strdate(info['created_at'])

        return [{
            'id': info['id'],
            'url': mediaURL,
            'uploader': info['user']['username'],
            'upload_date': upload_date,
            'title': info['title'],
            'ext': u'mp3',
            'description': info['description'],
        }]
class SoundcloudSetIE(InfoExtractor):
    """Information extractor for soundcloud.com sets
    To access the media, the uid of the song and a stream token
    must be extracted from the page source and the script must make
    a request to media.soundcloud.com/crossdomain.xml. Then
    the media can be grabbed by requesting from an url composed
    of the stream token and uid
    """

    # Same as SoundcloudIE but with a mandatory /sets/ path segment.
    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
    IE_NAME = u'soundcloud:set'

    def report_resolve(self, video_id):
        """Report information extraction."""
        self.to_screen(u'%s: Resolving id' % video_id)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): `if mobj is None:` guard appears elided before this raise.
        raise ExtractorError(u'Invalid URL: %s' % url)

        # extract uploader (which is in the url)
        uploader = mobj.group(1)
        # extract simple title (uploader + slug of song title)
        slug_title = mobj.group(2)
        simple_title = uploader + u'-' + slug_title
        full_title = '%s/sets/%s' % (uploader, slug_title)

        self.report_resolve(full_title)

        # Resolve the set permalink to its JSON metadata (tracks list).
        url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
        info_json = self._download_webpage(resolv_url, full_title)

        info = json.loads(info_json)
        # The resolve endpoint reports failures as an 'errors' list rather
        # than an HTTP error; surface each message to the downloader.
        if 'errors' in info:
            for err in info['errors']:
                self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))

        self.report_extraction(full_title)
        # One stream-definition request per track in the set.
        for track in info['tracks']:
            video_id = track['id']

            streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
            stream_json = self._download_webpage(streams_url, video_id, u'Downloading track info JSON')

            self.report_extraction(video_id)
            streams = json.loads(stream_json)
            mediaURL = streams['http_mp3_128_url']

            # Per-track result dict (literal partially elided in this view).
            'uploader': track['user']['username'],
            'upload_date': unified_strdate(track['created_at']),
            'title': track['title'],
            'description': track['description'],
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): `if mobj is None:` guard appears elided before this raise.
        raise ExtractorError(u'Invalid URL: %s' % url)

        # No short id in the URL; the full URL doubles as the video id.
        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # The real media path is base64-encoded in a JS variable on the page.
        mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
        raise ExtractorError(u'Unable to extract video url')
        real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        video_title = self._search_regex(r'contentTitle = "(.*?)";',

        # Extract description
        video_description = self._html_search_regex(r'<meta name="description" content="(.*)"(?:\s*/)?>',
            webpage, u'description', fatal=False)

        # Derive id and extension from the final path component of the
        # RTMP URL, e.g. ".../12345.mp4" -> ('12345', 'mp4').
        video_filename = video_url.split('/')[-1]
        video_id, extension = video_filename.split('.')

        # Result dict (surrounding literal partially elided in this view).
        'upload_date': None,
        'title': video_title,
        'ext': extension, # Extension is always(?) mp4, but seems to be flv
        'description': video_description,
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def report_download_json(self, file_id):
        """Report JSON download."""
        self.to_screen(u'Downloading json')

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json"""
        bitrate_list = jsonData[fmt]
        # Fall back to the highest available bitrate when the requested one
        # is missing or 'best' was asked for.
        if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
            bitrate = max(bitrate_list) # select highest

        url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        # Probe each candidate with an actual request; first one that
        # opens without error wins.
        for url in url_list:
            compat_urllib_request.urlopen(url)
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:

    def _print_formats(self, formats):
        # Human-readable listing used by --list-formats.
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                ext = formats[fmt][b][0]
                print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): `if mobj is None:` guard appears elided before this raise.
        raise ExtractorError(u'Invalid URL: %s' % url)
        # extract uploader & filename from url
        # NOTE(review): .decode('utf-8') on a matched str fails under
        # Python 3 (str has no decode); extractor is already disabled
        # via _WORKING = False above.
        uploader = mobj.group(1).decode('utf-8')
        file_id = uploader + "-" + mobj.group(2).decode('utf-8')

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        self.report_download_json(file_url)
        jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))

        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)

        # 'best' (or no preference): try each format until one yields a
        # live URL; otherwise honour the explicitly requested format.
        if req_format is None or req_format == 'best':
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                file_url = self.check_urls(url_list)
                if file_url is not None:
        if req_format not in formats:
            raise ExtractorError(u'Format is not available')
        url_list = self.get_urls(formats, req_format)
        file_url = self.check_urls(url_list)
        format_param = req_format

        # Result dict (surrounding literal partially elided in this view).
        'id': file_id.decode('utf-8'),
        'url': file_url.decode('utf-8'),
        'uploader': uploader.decode('utf-8'),
        'upload_date': None,
        'title': json_data['name'],
        'ext': file_url.split('.')[-1].decode('utf-8'),
        'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
        'thumbnail': json_data['thumbnail_url'],
        'description': json_data['description'],
        'player_url': player_url.decode('utf-8'),
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""

    # Three URL shapes: site root, a course page (course= only), or a
    # specific video (course= and video=).
    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): `if mobj is None:` guard appears elided before this raise.
        raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            # Partial info dict; filled in from the per-video XML below.
            'id': course + '_' + video,
            'upload_date': None,

            self.report_extraction(info['id'])
            # Each video has a sibling .xml metadata file next to the media.
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            info['title'] = mdoc.findall('./title')[0].text
            info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            raise ExtractorError(u'Invalid metadata XML file')
            info['ext'] = info['url'].rpartition('.')[2]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            'upload_date': None,

            coursepage = self._download_webpage(url, info['id'],
                                   note='Downloading course info page',
                                   errnote='Unable to download course info page')

            info['title'] = self._html_search_regex('<h1>([^<]+)</h1>', coursepage, 'title', default=info['id'])

            info['description'] = self._html_search_regex('<description>([^<]+)</description>',
                coursepage, u'description', fatal=False)

            # Collect unique video-page links and recurse into each as a
            # 'reference' entry.
            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            'type': 'reference',
            'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),

            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
        # Root page: enumerate all course pages the same way.
        'id': 'Stanford OpenClassroom',
        'upload_date': None,

        self.report_download_webpage(info['id'])
        rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
        rootpage = compat_urllib_request.urlopen(rootURL).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download course info page: ' + compat_str(err))

        info['title'] = info['id']

        links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
        'type': 'reference',
        'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),

        for entry in info['list']:
            assert entry['type'] == 'reference'
            results += self.extract(entry['url'])
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): `if mobj is None:` guard appears elided before this raise.
        raise ExtractorError(u'Invalid URL: %s' % url)
        # The scheme is optional in _VALID_URL; normalise to http.
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        song_name = self._html_search_regex(r'<meta name="mtv_vt" content="([^"]+)"/>',
            webpage, u'song name', fatal=False)

        video_title = self._html_search_regex(r'<meta name="mtv_an" content="([^"]+)"/>',

        mtvn_uri = self._html_search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>',
            webpage, u'mtvn_uri', fatal=False)

        content_id = self._search_regex(r'MTVN.Player.defaultPlaylistId = ([0-9]+);',
            webpage, u'content id', fatal=False)

        # mediaGen endpoint returns an XML document listing renditions.
        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        # Format string encodes container, resolution and bitrate, e.g.
        # "mp4-640x360_800".
        _,_,ext = rendition.attrib['type'].partition('/')
        format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
        video_url = rendition.find('./src').text
        raise ExtractorError('Invalid rendition field.')

        # Result dict (surrounding literal partially elided in this view).
        'uploader': performer,
        'upload_date': None,
        'title': video_title,
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com."""

    _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    # Session id: millisecond timestamp followed by two random components,
    # mimicking the site player's own sid generation.
    nowTime = int(time.time() * 1000)
    random1 = random.randint(1000,1998)
    random2 = random.randint(1000,9999)

    return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        # Deterministic pseudo-random shuffle of the alphabet, seeded by the
        # server-provided seed; used to decode obfuscated file ids.
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        for i in range(len(source)):
            # Linear-congruential step; floor-index into the shrinking pool.
            seed = (seed * 211 + 30031 ) % 65536
            index = math.floor(seed / 65536 * len(source) )
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        #return ''.join(mixed)

    def _get_file_id(self, fileId, seed):
        # Map each '*'-separated digit through the mixed alphabet to
        # recover the real file id.
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): `if mobj is None:` guard appears elided before this raise.
        raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        jsondata = self._download_webpage(info_url, video_id)

        self.report_extraction(video_id)

        config = json.loads(jsondata)

        video_title = config['data'][0]['title']
        seed = config['data'][0]['seed']

        format = self._downloader.params.get('format', None)
        supported_format = list(config['data'][0]['streamfileids'].keys())

        # Quality selection: prefer hd2 for 'best'; 'worst' branch elided here.
        if format is None or format == 'best':
            if 'hd2' in supported_format:
        elif format == 'worst':

        fileid = config['data'][0]['streamfileids'][format]
        keys = [s['k'] for s in config['data'][0]['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            raise ExtractorError(u'Unable to extract info section')

        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        #column 8,9 of fileid represent the segment number
        #fileid[7:9] should be changed
        for index, key in enumerate(keys):
            # Patch the segment number (hex, two digits) into the file id
            # and build one download URL per segment.
            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            # Per-segment info dict (literal partially elided in this view).
            'id': '%s_part%02d' % (video_id, index),
            'url': download_url,
            'upload_date': None,
            'title': video_title,

            files_info.append(info)
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'

    # Page-scraping patterns, kept as class attributes for reuse/testing.
    VIDEO_URL_RE = r'flv_url=(.*?)&'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): `if mobj is None:` guard appears elided before this raise.
        raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # The flv URL is percent-encoded in the page source.
        video_url = self._search_regex(self.VIDEO_URL_RE,
            webpage, u'video URL')
        video_url = compat_urllib_parse.unquote(video_url)

        video_title = self._html_search_regex(self.VIDEO_TITLE_RE,

        video_thumbnail = self._search_regex(self.VIDEO_THUMB_RE,
            webpage, u'thumbnail', fatal=False)

        # Result dict (surrounding literal partially elided in this view).
        'upload_date': None,
        'title': video_title,
        'thumbnail': video_thumbnail,
        'description': None,
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): `if mobj is None:` guard appears elided before this raise.
        raise ExtractorError(u'Invalid URL: %s' % url)

        post_url = mobj.group(0)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage')

        self.report_extraction(video_id)

        # Extract update date
        upload_date = self._html_search_regex('title="Timestamp">(.*?)</a>',
            webpage, u'upload date', fatal=False)

        # Convert timestring to a format suitable for filename
        upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
        upload_date = upload_date.strftime('%Y%m%d')

        uploader = self._html_search_regex(r'rel\="author".*?>(.*?)</a>',
            webpage, u'uploader', fatal=False)

        # Get the first line for title
        video_title = self._html_search_regex(r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]',
            webpage, 'title', default=u'NA')

        # Step 2, Stimulate clicking the image box to launch video
        video_page = self._search_regex('"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]',
            webpage, u'video page URL')
        webpage = self._download_webpage(video_page, video_id, u'Downloading video page')

        # Extract video links on video page
        """Extract video links of all sizes"""
        pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        mobj = re.findall(pattern, webpage)
        raise ExtractorError(u'Unable to extract video links')

        # Sort in resolution
        links = sorted(mobj)

        # Choose the lowest of the sort, i.e. highest resolution
        video_url = links[-1]
        # Only get the url. The resolution part in the tuple has no use anymore
        video_url = video_url[-1]
        # Treat escaped \u0026 style hex
        video_url = video_url.decode("unicode_escape")
        except AttributeError: # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')

        # Result dict (surrounding literal partially elided in this view).
        'uploader': uploader,
        'upload_date': upload_date,
        'title': video_title,
        'ext': video_extension,
class NBAIE(InfoExtractor):
    """Information extractor for nba.com video pages."""

    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*?)(?:/index\.html)?(?:\?.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): `if mobj is None:` guard appears elided before this raise.
        raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group(1)

        webpage = self._download_webpage(url, video_id)

        # Media URL is predictable from the path; no page scraping needed.
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'

        # Last path component serves as the short id.
        shortened_video_id = video_id.rpartition('/')[2]
        title = self._html_search_regex(r'<meta property="og:title" content="(.*?)"',
            webpage, 'title', default=shortened_video_id).replace('NBA.com: ', '')

        # It isn't there in the HTML it returns to us
        # uploader_date = self._html_search_regex(r'<b>Date:</b> (.*?)</div>', webpage, 'upload_date', fatal=False)

        description = self._html_search_regex(r'<meta name="description" (?:content|value)="(.*?)" />', webpage, 'description', fatal=False)

        # Result dict (surrounding literal partially elided in this view).
        'id': shortened_video_id,
        # 'uploader_date': uploader_date,
        'description': description,
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    # Three URL shapes: a channel, a broadcast (/b/<id>), or a chapter
    # (/c/<id>).
    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        (?P<channelid>[^/]+)|
        (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
        (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))

    # API pagination size for channel archives.
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self.to_screen(u'%s: Downloading video information from %d to %d' %
                (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url, video_id):
        webpage = self._download_webpage(url, video_id,
                                         u'Downloading video info JSON',
                                         u'unable to download video info JSON')

        response = json.loads(webpage)
        # The API signals errors with a dict instead of the expected list.
        if type(response) != list:
            error_text = response.get('error', 'unknown error')
            raise ExtractorError(u'Justin.tv API: %s' % error_text)
        for clip in response:
            video_url = clip['video_file_url']
            video_extension = os.path.splitext(video_url)[1][1:]
            # 'start_time' begins with YYYY-MM-DD; strip dashes -> YYYYMMDD.
            video_date = re.sub('-', '', clip['start_time'][:10])
            video_uploader_id = clip.get('user_id', clip.get('channel_id'))
            video_id = clip['id']
            video_title = clip.get('title', video_id)
            # Per-clip info dict (literal partially elided in this view).
            'title': video_title,
            'uploader': clip.get('channel_name', video_uploader_id),
            'uploader_id': video_uploader_id,
            'upload_date': video_date,
            'ext': video_extension,
        return (len(response), info)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): `if mobj is None:` guard appears elided before this raise.
        raise ExtractorError(u'invalid URL: %s' % url)

        api_base = 'http://api.justin.tv'

        if mobj.group('channelid'):
            video_id = mobj.group('channelid')
            api = api_base + '/channel/archives/%s.json' % video_id
        elif mobj.group('chapterid'):
            chapter_id = mobj.group('chapterid')

            # The chapter page embeds the parent broadcast's archive id.
            webpage = self._download_webpage(url, chapter_id)
            m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
            raise ExtractorError(u'Cannot find archive of a chapter')
            archive_id = m.group(1)

            api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
            chapter_info_xml = self._download_webpage(api, chapter_id,
                             note=u'Downloading chapter information',
                             errnote=u'Chapter information download failed')
            doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
            # Locate the <archive> element matching our archive id.
            for a in doc.findall('.//archive'):
                if archive_id == a.find('./id').text:
            raise ExtractorError(u'Could not find chapter in chapter information')

            video_url = a.find('./video_file_url').text
            video_ext = video_url.rpartition('.')[2] or u'flv'

            # Richer chapter metadata comes from the newer Kraken API.
            chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
            chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
                                   note='Downloading chapter metadata',
                                   errnote='Download of chapter metadata failed')
            chapter_info = json.loads(chapter_info_json)

            bracket_start = int(doc.find('.//bracket_start').text)
            bracket_end = int(doc.find('.//bracket_end').text)

            # TODO determine start (and probably fix up file)
            #  youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
            #video_url += u'?start=' + TODO:start_timestamp
            # bracket_start is 13290, but we want 51670615
            self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
                                            u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))

            # Chapter result dict (literal partially elided in this view).
            'id': u'c' + chapter_id,
            'title': chapter_info['title'],
            'thumbnail': chapter_info['preview'],
            'description': chapter_info['description'],
            'uploader': chapter_info['channel']['display_name'],
            'uploader_id': chapter_info['channel']['name'],
        # Broadcast (/b/<id>) branch.
        video_id = mobj.group('videoid')
        api = api_base + '/broadcast/by_archive/%s.json' % video_id

        self.report_extraction(video_id)

        # Page through the archive API until a short page signals the end.
        limit = self._JUSTIN_PAGE_LIMIT
        self.report_download_page(video_id, offset)
        page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
        page_count, page_info = self._parse_page(page_url, video_id)
        info.extend(page_info)
        if not paged or page_count != limit:
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com videos."""

    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): `if mobj is None:` guard appears elided before this raise.
        raise ExtractorError(u'invalid URL: %s' % url)

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        # Second <source> inside the <video> tag carries the media URL.
        video_url = self._html_search_regex(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"',
            webpage, u'video URL', flags=re.DOTALL)

        # Prefer the player heading; fall back to the page <title>.
        title = self._html_search_regex((r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>",
            r'<title>(?P<title>[^<]+?)</title>'), webpage, 'title', flags=re.DOTALL)

        video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
            webpage, u'description', fatal=False, flags=re.DOTALL)

        # Result dict (surrounding literal partially elided in this view).
        'description': video_description,
class SteamIE(InfoExtractor):
    """Information extractor for store.steampowered.com video/app pages."""

    _VALID_URL = r"""http://store\.steampowered\.com/
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
    _VIDEO_PAGE_TEMPLATE = 'http://store.steampowered.com/video/%s/'
    # Pre-filled birth date query string to bypass the age gate.
    _AGECHECK_TEMPLATE = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970'

    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # _VALID_URL uses verbose-mode comments, so re.VERBOSE is required.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        gameID = m.group('gameID')

        videourl = self._VIDEO_PAGE_TEMPLATE % gameID
        webpage = self._download_webpage(videourl, gameID)

        # Retry through the age-check URL when the gate page is served.
        if re.search('<h2>Please enter your birth date to continue:</h2>', webpage) is not None:
            videourl = self._AGECHECK_TEMPLATE % gameID
            self.report_age_confirmation()
            webpage = self._download_webpage(videourl, gameID)

        self.report_extraction(gameID)
        game_title = self._html_search_regex(r'<h2 class="pageheader">(.*?)</h2>',
                                             webpage, 'game title')

        # Three parallel scans of the page: movie entries, their display
        # titles and their thumbnails, zipped positionally.
        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        mweb = re.finditer(urlRE, webpage)
        namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
        titles = re.finditer(namesRE, webpage)
        thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
        thumbs = re.finditer(thumbsRE, webpage)

        for vid,vtitle,thumb in zip(mweb,titles,thumbs):
            video_id = vid.group('videoID')
            title = vtitle.group('videoName')
            video_url = vid.group('videoURL')
            video_thumb = thumb.group('thumbnail')
            raise ExtractorError(u'Cannot find video url for %s' % video_id)
            # Per-video dict (literal partially elided in this view).
            'title': unescapeHTML(title),
            'thumbnail': video_thumb
        return [self.playlist_result(videos, gameID, game_title)]
class UstreamIE(InfoExtractor):
    """Information extractor for www.ustream.tv recorded videos."""

    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')

        # Media URL is derived directly from the id; page is only scraped
        # for metadata.
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        video_title = self._html_search_regex(r'data-title="(?P<title>.+)"',

        uploader = self._html_search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
            webpage, u'uploader', fatal=False, flags=re.DOTALL)

        thumbnail = self._html_search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"',
            webpage, u'thumbnail', fatal=False)

        # Result dict (surrounding literal partially elided in this view).
        'title': video_title,
        'uploader': uploader,
        'thumbnail': thumbnail,
class WorldStarHipHopIE(InfoExtractor):
    """Information extractor for worldstarhiphop.com / worldstarcandy.com."""

    _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
    IE_NAME = u'WorldStarHipHop'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('id')

        webpage_src = self._download_webpage(url, video_id)

        # The player is configured via so.addVariable("file", ...).
        video_url = self._search_regex(r'so\.addVariable\("file","(.*?)"\)',
            webpage_src, u'video URL')

        if 'mp4' in video_url:

        video_title = self._html_search_regex(r"<title>(.*)</title>",
            webpage_src, u'title')

        # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
        thumbnail = self._html_search_regex(r'rel="image_src" href="(.*)" />',
            webpage_src, u'thumbnail', fatal=False)

        # Candy pages carry the title in a dedicated span instead.
        _title = r"""candytitles.*>(.*)</span>"""
        mobj = re.search(_title, webpage_src)
        if mobj is not None:
            video_title = mobj.group(1)

        # Result dict (surrounding literal partially elided in this view).
        'title' : video_title,
        'thumbnail' : thumbnail,
class RBMARadioIE(InfoExtractor):
    """Information extractor for rbmaradio.com show pages."""

    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')

        webpage = self._download_webpage(url, video_id)

        # Show metadata is embedded as a JS assignment `gon.show = {...};`.
        json_data = self._search_regex(r'window\.gon.*?gon\.show=(.+?);$',
            webpage, u'json data', flags=re.MULTILINE)

        data = json.loads(json_data)
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        # Append a fixed 256 kbps constant-bitrate parameter to the stream URL.
        video_url = data['akamai_url'] + '&cbr=256'
        url_parts = compat_urllib_parse_urlparse(video_url)
        video_ext = url_parts.path.rpartition('.')[2]

        # Result dict (surrounding literal partially elided in this view);
        # optional fields use .get() so missing keys degrade to None.
        'title': data['title'],
        'description': data.get('teaser_text'),
        'location': data.get('country_of_origin'),
        'uploader': data.get('host', {}).get('name'),
        'uploader_id': data.get('host', {}).get('slug'),
        'thumbnail': data.get('image', {}).get('large_url_2x'),
        'duration': data.get('duration'),
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""

    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s' % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        # Linear search for the entry whose 'format' matches the request.
        if(x["format"]==req_format):

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        # NOTE(review): `if mobj is None:` guard appears elided before this raise.
        raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        # Age gate is bypassed with a cookie rather than a form post.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get JSON parameters
        json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters')
        params = json.loads(json_params)
        raise ExtractorError(u'Invalid JSON')

        self.report_extraction(video_id)
        video_title = params['title']
        upload_date = unified_strdate(params['release_date_f'])
        video_description = params['description']
        video_uploader = params['submitted_by']
        thumbnail = params['thumbnails'][0]['image']
        raise ExtractorError('Missing JSON parameter: ' + sys.exc_info()[1])

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
            webpage, u'download list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if(len(links) == 0):
            raise ExtractorError(u'ERROR: no known formats available for video')

        self.to_screen(u'Links found: %d' % len(links))

        # A link looks like this:
        # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
        # A path looks like this:
        # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
        video_url = unescapeHTML( link )
        path = compat_urllib_parse_urlparse( video_url ).path
        extension = os.path.splitext( path )[1][1:]
        # Resolution and bitrate are encoded in the 5th path segment,
        # e.g. "480p_370k_8004515" -> ['480p', '370k'].
        format = path.split('/')[4].split('_')[:2]
        format = "-".join( format )
        # title = u'%s-%s-%s' % (video_title, size, bitrate)

        # Per-format dict (literal partially elided in this view).
        'uploader': video_uploader,
        'upload_date': upload_date,
        'title': video_title,
        'thumbnail': thumbnail,
        'description': video_description

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)

        req_format = self._downloader.params.get('format', None)
        self.to_screen(u'Format: %s' % req_format)

        # Formats are ordered best-first, so 'worst' is the last entry.
        if req_format is None or req_format == 'best':
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
        format = self._specific( req_format, formats )
        raise ExtractorError(u'Requested format not available')
# Extracts the .flv stream URL and upload date from a pornotube.com page.
# NOTE(review): several original lines are elided in this view (e.g. the
# "if mobj is None:" guard before the Invalid URL raise, and part of the
# returned info dict); visible code is kept verbatim.
2181 class PornotubeIE(InfoExtractor):
2182     """Information extractor for pornotube.com."""
2183     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
2185     def _real_extract(self, url):
2186         mobj = re.match(self._VALID_URL, url)
2188             raise ExtractorError(u'Invalid URL: %s' % url)
2190         video_id = mobj.group('videoid')
2191         video_title = mobj.group('title')
2193         # Get webpage content
2194         webpage = self._download_webpage(url, video_id)
2197         VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
2198         video_url = self._search_regex(VIDEO_URL_RE, webpage, u'video url')
2199         video_url = compat_urllib_parse.unquote(video_url)
2201         #Get the uploaded date
2202         VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
2203         upload_date = self._html_search_regex(VIDEO_UPLOADED_RE, webpage, u'upload date', fatal=False)
2204         if upload_date: upload_date = unified_strdate(upload_date)
2206         info = {'id': video_id,
2209                 'upload_date': upload_date,
2210                 'title': video_title,
# Resolves a youjizz.com video page to its embed page, then extracts the
# real media URL from the embed player's addVariable("file", ...) call.
# NOTE(review): lines are elided in this view (e.g. the "if result is None:"
# guard before the embed-page raise); visible code is kept verbatim.
2216 class YouJizzIE(InfoExtractor):
2217     """Information extractor for youjizz.com."""
2218     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'
2220     def _real_extract(self, url):
2221         mobj = re.match(self._VALID_URL, url)
2223             raise ExtractorError(u'Invalid URL: %s' % url)
2225         video_id = mobj.group('videoid')
2227         # Get webpage content
2228         webpage = self._download_webpage(url, video_id)
2230         # Get the video title
2231         video_title = self._html_search_regex(r'<title>(?P<title>.*)</title>',
2232             webpage, u'title').strip()
2234         # Get the embed page
2235         result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
2237             raise ExtractorError(u'ERROR: unable to extract embed page')
2239         embed_page_url = result.group(0).strip()
2240         video_id = result.group('videoid')
2242         webpage = self._download_webpage(embed_page_url, video_id)
2245         video_url = self._search_regex(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
2246             webpage, u'video URL')
2248         info = {'id': video_id,
2250                 'title': video_title,
2253                 'player_url': embed_page_url}
# Extracts every track of an 8tracks.com mix by paging through the
# sets/play + sets/next JSON API with a random session id until the API
# reports the last track.
# NOTE(review): lines are elided in this view — in particular the
# assignment of `mix_id` (presumably from `data`, e.g. data['id']) and
# the construction/accumulation of the per-track dicts are not visible;
# verify against the full file. Visible code is kept verbatim.
2257 class EightTracksIE(InfoExtractor):
2259     _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'
2261     def _real_extract(self, url):
2262         mobj = re.match(self._VALID_URL, url)
2264             raise ExtractorError(u'Invalid URL: %s' % url)
2265         playlist_id = mobj.group('id')
2267         webpage = self._download_webpage(url, playlist_id)
2269         json_like = self._search_regex(r"PAGE.mix = (.*?);\n", webpage, u'trax information', flags=re.DOTALL)
2270         data = json.loads(json_like)
2272         session = str(random.randint(0, 1000000000))
2274         track_count = data['tracks_count']
2275         first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
2276         next_url = first_url
2278         for i in itertools.count():
2279             api_json = self._download_webpage(next_url, playlist_id,
2280                 note=u'Downloading song information %s/%s' % (str(i+1), track_count),
2281                 errnote=u'Failed to download song information')
2282             api_data = json.loads(api_json)
2283             track_data = api_data[u'set']['track']
2285                 'id': track_data['id'],
2286                 'url': track_data['track_file_stream_url'],
2287                 'title': track_data['performer'] + u' - ' + track_data['name'],
2288                 'raw_title': track_data['name'],
2289                 'uploader_id': data['user']['login'],
2293             if api_data['set']['at_last_track']:
2295             next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
# keek.com: the media and thumbnail URLs are derived directly from the
# video id (CDN URL templates); only title/uploader come from the page.
# NOTE(review): lines are elided in this view (e.g. the opening of the
# returned info dict); visible code is kept verbatim.
2298 class KeekIE(InfoExtractor):
2299     _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
2302     def _real_extract(self, url):
2303         m = re.match(self._VALID_URL, url)
2304         video_id = m.group('videoID')
2306         video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
2307         thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
2308         webpage = self._download_webpage(url, video_id)
2310         video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
2313         uploader = self._html_search_regex(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
2314             webpage, u'uploader', fatal=False)
2320             'title': video_title,
2321             'thumbnail': thumbnail,
2322             'uploader': uploader
# ted.com extractor: handles both single talks and playlists. The verbose
# _VALID_URL distinguishes them via the type_talk/type_playlist groups, so
# suitable() is overridden to compile with re.VERBOSE.
# NOTE(review): lines are elided in this view (parts of the URL pattern,
# the video_RE prefix, and parts of the returned talk-info dict);
# visible code is kept verbatim.
2326 class TEDIE(InfoExtractor):
2327     _VALID_URL=r'''http://www\.ted\.com/
2329         ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
2331         ((?P<type_talk>talks)) # We have a simple talk
2333         (/lang/(.*?))? # The url may contain the language
2334         /(?P<name>\w+) # Here goes the name and then ".html"
2338     def suitable(cls, url):
2339         """Receives a URL and returns True if suitable for this IE."""
2340         return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
2342     def _real_extract(self, url):
2343         m=re.match(self._VALID_URL, url, re.VERBOSE)
2344         if m.group('type_talk'):
2345             return [self._talk_info(url)]
2347             playlist_id=m.group('playlist_id')
2348             name=m.group('name')
2349             self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
2350             return [self._playlist_videos_info(url,name,playlist_id)]
2352     def _playlist_videos_info(self,url,name,playlist_id=0):
2353         '''Returns the videos of the playlist'''
2355             <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
2356             ([.\s]*?)data-playlist_item_id="(\d+)"
2357             ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
2359         video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
2360         webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
2361         m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
2362         m_names=re.finditer(video_name_RE,webpage)
2364         playlist_title = self._html_search_regex(r'div class="headline">\s*?<h1>\s*?<span>(.*?)</span>',
2365                                                  webpage, 'playlist title')
2367         playlist_entries = []
2368         for m_video, m_name in zip(m_videos,m_names):
2369             video_id=m_video.group('video_id')
2370             talk_url='http://www.ted.com%s' % m_name.group('talk_url')
2371             playlist_entries.append(self.url_result(talk_url, 'TED'))
2372         return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)
2374     def _talk_info(self, url, video_id=0):
2375         """Return the video for the talk in the url"""
2376         m = re.match(self._VALID_URL, url,re.VERBOSE)
2377         video_name = m.group('name')
2378         webpage = self._download_webpage(url, video_id, 'Downloading \"%s\" page' % video_name)
2379         self.report_extraction(video_name)
2380         # If the url includes the language we get the title translated
2381         title = self._html_search_regex(r'<span id="altHeadline" >(?P<title>.*)</span>',
2383         json_data = self._search_regex(r'<script.*?>var talkDetails = ({.*?})</script>',
2384             webpage, 'json data')
2385         info = json.loads(json_data)
2386         desc = self._html_search_regex(r'<div class="talk-intro">.*?<p.*?>(.*?)</p>',
2387             webpage, 'description', flags = re.DOTALL)
2389         thumbnail = self._search_regex(r'</span>[\s.]*</div>[\s.]*<img src="(.*?)"',
2390             webpage, 'thumbnail')
2393             'url': info['htmlStreams'][-1]['file'],
2396             'thumbnail': thumbnail,
2397             'description': desc,
# myspass.de: the video id is the last (or second-to-last, when the URL
# has a trailing slash) path element; metadata comes from an XML endpoint
# parsed with xml.etree.ElementTree.
# NOTE(review): lines are elided in this view (the trailing-slash check,
# defaults for format/description/thumbnail, and the opening of the
# returned info dict); visible code is kept verbatim.
2401 class MySpassIE(InfoExtractor):
2402     _VALID_URL = r'http://www.myspass.de/.*'
2404     def _real_extract(self, url):
2405         META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'
2407         # video id is the last path element of the URL
2408         # usually there is a trailing slash, so also try the second but last
2409         url_path = compat_urllib_parse_urlparse(url).path
2410         url_parent_path, video_id = os.path.split(url_path)
2412             _, video_id = os.path.split(url_parent_path)
2415         metadata_url = META_DATA_URL_TEMPLATE % video_id
2416         metadata_text = self._download_webpage(metadata_url, video_id)
2417         metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))
2419         # extract values from metadata
2420         url_flv_el = metadata.find('url_flv')
2421         if url_flv_el is None:
2422             raise ExtractorError(u'Unable to extract download url')
2423         video_url = url_flv_el.text
2424         extension = os.path.splitext(video_url)[1][1:]
2425         title_el = metadata.find('title')
2426         if title_el is None:
2427             raise ExtractorError(u'Unable to extract title')
2428         title = title_el.text
2429         format_id_el = metadata.find('format_id')
2430         if format_id_el is None:
2433             format = format_id_el.text
2434         description_el = metadata.find('description')
2435         if description_el is not None:
2436             description = description_el.text
2439         imagePreview_el = metadata.find('imagePreview')
2440         if imagePreview_el is not None:
2441             thumbnail = imagePreview_el.text
2450             'thumbnail': thumbnail,
2451             'description': description
# spiegel.de: title comes from the page; media URL and duration come from
# a per-video XML document; the last <type> node in the XML is used
# (presumably the best quality — TODO confirm).
# NOTE(review): lines are elided in this view (e.g. the opening of the
# returned info dict); visible code is kept verbatim.
2455 class SpiegelIE(InfoExtractor):
2456     _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'
2458     def _real_extract(self, url):
2459         m = re.match(self._VALID_URL, url)
2460         video_id = m.group('videoID')
2462         webpage = self._download_webpage(url, video_id)
2464         video_title = self._html_search_regex(r'<div class="module-title">(.*?)</div>',
2467         xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
2468         xml_code = self._download_webpage(xml_url, video_id,
2469                     note=u'Downloading XML', errnote=u'Failed to download XML')
2471         idoc = xml.etree.ElementTree.fromstring(xml_code)
2472         last_type = idoc[-1]
2473         filename = last_type.findall('./filename')[0].text
2474         duration = float(last_type.findall('./duration')[0].text)
2476         video_url = 'http://video2.spiegel.de/flash/' + filename
2477         video_ext = filename.rpartition('.')[2]
2482             'title': video_title,
2483             'duration': duration,
# liveleak.com: media URL comes from the player config ('file: "..."');
# title/description/uploader are scraped from og: meta tags and markup.
# NOTE(review): lines are elided in this view (e.g. the "if mobj is None:"
# guard and the opening of the returned info dict); visible code verbatim.
2487 class LiveLeakIE(InfoExtractor):
2489     _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
2490     IE_NAME = u'liveleak'
2492     def _real_extract(self, url):
2493         mobj = re.match(self._VALID_URL, url)
2495             raise ExtractorError(u'Invalid URL: %s' % url)
2497         video_id = mobj.group('video_id')
2499         webpage = self._download_webpage(url, video_id)
2501         video_url = self._search_regex(r'file: "(.*?)",',
2502             webpage, u'video URL')
2504         video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
2505             webpage, u'title').replace('LiveLeak.com -', '').strip()
2507         video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
2508             webpage, u'description', fatal=False)
2510         video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
2511             webpage, u'uploader', fatal=False)
2517             'title': video_title,
2518             'description': video_description,
2519             'uploader': video_uploader
# tumblr.com video posts: the canonical post URL is rebuilt from the blog
# name and post id, then the media URL/extension are pulled out of the
# escaped (\x22-quoted) embedded player markup.
# NOTE(review): lines are elided in this view (e.g. the "if video is None:"
# guard and the tail of the returned dict); visible code is kept verbatim.
2526 class TumblrIE(InfoExtractor):
2527     _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'
2529     def _real_extract(self, url):
2530         m_url = re.match(self._VALID_URL, url)
2531         video_id = m_url.group('id')
2532         blog = m_url.group('blog_name')
2534         url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
2535         webpage = self._download_webpage(url, video_id)
2537         re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
2538         video = re.search(re_video, webpage)
2540             raise ExtractorError(u'Unable to extract video')
2541         video_url = video.group('video_url')
2542         ext = video.group('ext')
2544         video_thumbnail = self._search_regex(r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
2545             webpage, u'thumbnail', fatal=False)  # We pick the first poster
2546         if video_thumbnail: video_thumbnail = video_thumbnail.replace('\\', '')
2548         # The only place where you can get a title, it's not complete,
2549         # but searching in other places doesn't work for all videos
2550         video_title = self._html_search_regex(r'<title>(?P<title>.*?)</title>',
2551             webpage, u'title', flags=re.DOTALL)
2553         return [{'id': video_id,
2555                  'title': video_title,
2556                  'thumbnail': video_thumbnail,
# bandcamp.com: only free tracks are supported. Flow: track page ->
# freeDownloadPage link -> items JSON on the download page -> rebuild the
# statdownload URL (the direct mp3-320 url expires) -> read retry_url.
# NOTE(review): the local variable `id` shadows the builtin id() — works,
# but worth renaming when this block is next touched. Some lines are
# elided in this view (e.g. the tail of track_info); code kept verbatim.
2560 class BandcampIE(InfoExtractor):
2561     _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'
2563     def _real_extract(self, url):
2564         mobj = re.match(self._VALID_URL, url)
2565         title = mobj.group('title')
2566         webpage = self._download_webpage(url, title)
2567         # We get the link to the free download page
2568         m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
2569         if m_download is None:
2570             raise ExtractorError(u'No free songs found')
2572         download_link = m_download.group(1)
2573         id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
2574                        webpage, re.MULTILINE|re.DOTALL).group('id')
2576         download_webpage = self._download_webpage(download_link, id,
2577                                                   'Downloading free downloads page')
2578         # We get the dictionary of the track from some javascrip code
2579         info = re.search(r'items: (.*?),$',
2580                          download_webpage, re.MULTILINE).group(1)
2581         info = json.loads(info)[0]
2582         # We pick mp3-320 for now, until format selection can be easily implemented.
2583         mp3_info = info[u'downloads'][u'mp3-320']
2584         # If we try to use this url it says the link has expired
2585         initial_url = mp3_info[u'url']
2586         re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
2587         m_url = re.match(re_url, initial_url)
2588         #We build the url we will use to get the final track url
2589         # This url is build in Bandcamp in the script download_bunde_*.js
2590         request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), id, m_url.group('ts'))
2591         final_url_webpage = self._download_webpage(request_url, id, 'Requesting download url')
2592         # If we could correctly generate the .rand field the url would be
2593         #in the "download_url" key
2594         final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)
2596         track_info = {'id':id,
2597                       'title' : info[u'title'],
2600                       'thumbnail' : info[u'thumb_url'],
2601                       'uploader' : info[u'artist']
# redtube.com: straightforward scrape of the <source> mp4 URL and the
# page title; extension is hard-coded to mp4.
# NOTE(review): lines are elided in this view (e.g. the "if mobj is None:"
# guard and the opening of the returned dict); visible code verbatim.
2606 class RedTubeIE(InfoExtractor):
2607     """Information Extractor for redtube"""
2608     _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'
2610     def _real_extract(self,url):
2611         mobj = re.match(self._VALID_URL, url)
2613             raise ExtractorError(u'Invalid URL: %s' % url)
2615         video_id = mobj.group('id')
2616         video_extension = 'mp4'
2617         webpage = self._download_webpage(url, video_id)
2619         self.report_extraction(video_id)
2621         video_url = self._html_search_regex(r'<source src="(.+?)" type="video/mp4">',
2622             webpage, u'video URL')
2624         video_title = self._html_search_regex('<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
2630             'ext':      video_extension,
2631             'title':    video_title,
# ina.fr: downloads the video's MRSS feed from player.ina.fr and reads the
# mp4 URL from <media:player> and the title from the CDATA <title>.
# NOTE(review): lines are elided in this view; visible code kept verbatim.
2634 class InaIE(InfoExtractor):
2635     """Information Extractor for Ina.fr"""
2636     _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'
2638     def _real_extract(self,url):
2639         mobj = re.match(self._VALID_URL, url)
2641         video_id = mobj.group('id')
2642         mrss_url='http://player.ina.fr/notices/%s.mrss' % video_id
2643         video_extension = 'mp4'
2644         webpage = self._download_webpage(mrss_url, video_id)
2646         self.report_extraction(video_id)
2648         video_url = self._html_search_regex(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
2649             webpage, u'video URL')
2651         video_title = self._search_regex(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
2657             'ext':      video_extension,
2658             'title':    video_title,
# howcast.com: canonical page URL is rebuilt from the id; media URL comes
# from the player config ('file': "http://mobile-media..."), title /
# description / thumbnail from meta tags.
# NOTE(review): lines are elided in this view; visible code kept verbatim.
2661 class HowcastIE(InfoExtractor):
2662     """Information Extractor for Howcast.com"""
2663     _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'
2665     def _real_extract(self, url):
2666         mobj = re.match(self._VALID_URL, url)
2668         video_id = mobj.group('id')
2669         webpage_url = 'http://www.howcast.com/videos/' + video_id
2670         webpage = self._download_webpage(webpage_url, video_id)
2672         self.report_extraction(video_id)
2674         video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
2675             webpage, u'video URL')
2677         video_title = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
2680         video_description = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
2681             webpage, u'description', fatal=False)
2683         thumbnail = self._html_search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
2684             webpage, u'thumbnail', fatal=False)
2690             'title': video_title,
2691             'description': video_description,
2692             'thumbnail': thumbnail,
# vine.co: media URL from the twitter:player:stream meta tag; title,
# thumbnail and uploader from og: tags / page markup.
# NOTE(review): lines are elided in this view; visible code kept verbatim.
2695 class VineIE(InfoExtractor):
2696     """Information Extractor for Vine.co"""
2697     _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'
2699     def _real_extract(self, url):
2700         mobj = re.match(self._VALID_URL, url)
2702         video_id = mobj.group('id')
2703         webpage_url = 'https://vine.co/v/' + video_id
2704         webpage = self._download_webpage(webpage_url, video_id)
2706         self.report_extraction(video_id)
2708         video_url = self._html_search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
2709             webpage, u'video URL')
2711         video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
2714         thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"',
2715             webpage, u'thumbnail', fatal=False)
2717         uploader = self._html_search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
2718             webpage, u'uploader', fatal=False, flags=re.DOTALL)
2724             'title': video_title,
2725             'thumbnail': thumbnail,
2726             'uploader': uploader,
# flickr.com videos: two-step XML dance — fetch the photo secret from the
# page, use it to get a node_id from video_mtl_xml.gne, then resolve the
# playlist XML for the STREAM APP/FULLPATH pair that forms the media URL.
# NOTE(review): lines are elided in this view (e.g. the "if mobj is None:"
# guard before the 'Unable to extract video url' raise); code verbatim.
2729 class FlickrIE(InfoExtractor):
2730     """Information Extractor for Flickr videos"""
2731     _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'
2733     def _real_extract(self, url):
2734         mobj = re.match(self._VALID_URL, url)
2736         video_id = mobj.group('id')
2737         video_uploader_id = mobj.group('uploader_id')
2738         webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
2739         webpage = self._download_webpage(webpage_url, video_id)
2741         secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')
2743         first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
2744         first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')
2746         node_id = self._html_search_regex(r'<Item id="id">(\d+-\d+)</Item>',
2747             first_xml, u'node_id')
2749         second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
2750         second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')
2752         self.report_extraction(video_id)
2754         mobj = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
2756             raise ExtractorError(u'Unable to extract video url')
2757         video_url = mobj.group(1) + unescapeHTML(mobj.group(2))
2759         video_title = self._html_search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
2760             webpage, u'video title')
2762         video_description = self._html_search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
2763             webpage, u'description', fatal=False)
2765         thumbnail = self._html_search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
2766             webpage, u'thumbnail', fatal=False)
2772             'title': video_title,
2773             'description': video_description,
2774             'thumbnail': thumbnail,
2775             'uploader_id': video_uploader_id,
# teamcoco.com: the numeric video id is scraped from the article markup,
# then a per-video XML document (cvp/2.0/<id>.xml) supplies the
# high-quality file URL.
# NOTE(review): lines are elided in this view; visible code kept verbatim.
2778 class TeamcocoIE(InfoExtractor):
2779     _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'
2781     def _real_extract(self, url):
2782         mobj = re.match(self._VALID_URL, url)
2784             raise ExtractorError(u'Invalid URL: %s' % url)
2785         url_title = mobj.group('url_title')
2786         webpage = self._download_webpage(url, url_title)
2788         video_id = self._html_search_regex(r'<article class="video" data-id="(\d+?)"',
2789             webpage, u'video id')
2791         self.report_extraction(video_id)
2793         video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
2796         thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)"',
2797             webpage, u'thumbnail', fatal=False)
2799         video_description = self._html_search_regex(r'<meta property="og:description" content="(.*?)"',
2800             webpage, u'description', fatal=False)
2802         data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
2803         data = self._download_webpage(data_url, video_id, 'Downloading data webpage')
2805         video_url = self._html_search_regex(r'<file type="high".*?>(.*?)</file>',
2812             'title': video_title,
2813             'thumbnail': thumbnail,
2814             'description': video_description,
# xhamster.com: media URL comes from the player config ('srv'/'file'
# pair); when 'srv' is empty the 'file' value is already a full
# percent-encoded URL, otherwise server + '/key=' + file is assembled.
# Upload date is reassembled as YYYYMMDD from the 'hint' attribute.
# NOTE(review): lines are elided in this view (guards, else branches,
# the opening of the returned dict); visible code is kept verbatim.
2817 class XHamsterIE(InfoExtractor):
2818     """Information Extractor for xHamster"""
2819     _VALID_URL = r'(?:http://)?(?:www.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'
2821     def _real_extract(self,url):
2822         mobj = re.match(self._VALID_URL, url)
2824         video_id = mobj.group('id')
2825         mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
2826         webpage = self._download_webpage(mrss_url, video_id)
2828         mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
2830             raise ExtractorError(u'Unable to extract media URL')
2831         if len(mobj.group('server')) == 0:
2832             video_url = compat_urllib_parse.unquote(mobj.group('file'))
2834             video_url = mobj.group('server')+'/key='+mobj.group('file')
2835         video_extension = video_url.split('.')[-1]
2837         video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
2840         # Can't see the description anywhere in the UI
2841         # video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
2842         #     webpage, u'description', fatal=False)
2843         # if video_description: video_description = unescapeHTML(video_description)
2845         mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
2847             video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
2849             video_upload_date = None
2850             self._downloader.report_warning(u'Unable to extract upload date')
2852         video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
2853             webpage, u'uploader id', default=u'anonymous')
2855         video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
2856             webpage, u'thumbnail', fatal=False)
2861             'ext':      video_extension,
2862             'title':    video_title,
2863             # 'description': video_description,
2864             'upload_date': video_upload_date,
2865             'uploader_id': video_uploader_id,
2866             'thumbnail': video_thumbnail
# hypem.com: requests the track page with ax/ts query params, keeps the
# Set-Cookie header, parses the displayList-data JSON for the first
# track, then calls the /serve/source endpoint (with the cookie) whose
# JSON response carries the final stream URL.
# NOTE(review): lines are elided in this view — notably the assignment of
# `key` (presumably from the track dict) and the try/except bodies around
# both json.loads calls; visible code is kept verbatim.
2869 class HypemIE(InfoExtractor):
2870     """Information Extractor for hypem"""
2871     _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'
2873     def _real_extract(self, url):
2874         mobj = re.match(self._VALID_URL, url)
2876             raise ExtractorError(u'Invalid URL: %s' % url)
2877         track_id = mobj.group(1)
2879         data = { 'ax': 1, 'ts': time.time() }
2880         data_encoded = compat_urllib_parse.urlencode(data)
2881         complete_url = url + "?" + data_encoded
2882         request = compat_urllib_request.Request(complete_url)
2883         response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
2884         cookie = urlh.headers.get('Set-Cookie', '')
2886         self.report_extraction(track_id)
2888         html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
2889             response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
2891             track_list = json.loads(html_tracks)
2892             track = track_list[u'tracks'][0]
2894             raise ExtractorError(u'Hypemachine contained invalid JSON.')
2897         track_id = track[u"id"]
2898         artist = track[u"artist"]
2899         title = track[u"song"]
2901         serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
2902         request = compat_urllib_request.Request(serve_url, "" , {'Content-Type': 'application/json'})
2903         request.add_header('cookie', cookie)
2904         song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
2906             song_data = json.loads(song_data_json)
2908             raise ExtractorError(u'Hypemachine contained invalid JSON.')
2909         final_url = song_data[u"url"]
# vbox7.com: follows the JS window.location redirect on the play page,
# scrapes the title, then POSTs to /play/magare.do; the response is a
# two-field querystring whose values are the media and thumbnail URLs.
# NOTE(review): lines are elided in this view (e.g. the opening of the
# returned info dict); visible code is kept verbatim. The split('=')[1]
# parse assumes exactly two '&'-separated key=value pairs — fragile.
2919 class Vbox7IE(InfoExtractor):
2920     """Information Extractor for Vbox7"""
2921     _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'
2923     def _real_extract(self,url):
2924         mobj = re.match(self._VALID_URL, url)
2926             raise ExtractorError(u'Invalid URL: %s' % url)
2927         video_id = mobj.group(1)
2929         redirect_page, urlh = self._download_webpage_handle(url, video_id)
2930         new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
2931         redirect_url = urlh.geturl() + new_location
2932         webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')
2934         title = self._html_search_regex(r'<title>(.*)</title>',
2935             webpage, u'title').split('/')[0].strip()
2938         info_url = "http://vbox7.com/play/magare.do"
2939         data = compat_urllib_parse.urlencode({'as3':'1','vid':video_id})
2940         info_request = compat_urllib_request.Request(info_url, data)
2941         info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
2942         info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
2943         if info_response is None:
2944             raise ExtractorError(u'Unable to extract the media url')
2945         (final_url, thumbnail_url) = map(lambda x: x.split('=')[1], info_response.split('&'))
2952             'thumbnail': thumbnail_url,
2955 class GametrailersIE(InfoExtractor):
2956 _VALID_URL = r'http://www.gametrailers.com/(?P<type>videos|reviews|full-episodes)/(?P<id>.*?)/(?P<title>.*)'
2958 def _real_extract(self, url):
2959 mobj = re.match(self._VALID_URL, url)
2961 raise ExtractorError(u'Invalid URL: %s' % url)
2962 video_id = mobj.group('id')
2963 video_type = mobj.group('type')
2964 webpage = self._download_webpage(url, video_id)
2965 if video_type == 'full-episodes':
2966 mgid_re = r'data-video="(?P<mgid>mgid:.*?)"'
2968 mgid_re = r'data-contentId=\'(?P<mgid>mgid:.*?)\''
2969 mgid = self._search_regex(mgid_re, webpage, u'mgid')
2970 data = compat_urllib_parse.urlencode({'uri': mgid, 'acceptMethods': 'fms'})
2972 info_page = self._download_webpage('http://www.gametrailers.com/feeds/mrss?' + data,
2973 video_id, u'Downloading video info')
2974 links_webpage = self._download_webpage('http://www.gametrailers.com/feeds/mediagen/?' + data,
2975 video_id, u'Downloading video urls info')
2977 self.report_extraction(video_id)
2978 info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]></title>.*
2979 <description><!\[CDATA\[(?P<description>.*?)\]\]></description>.*
2981 <url>(?P<thumb>.*?)</url>.*
2984 m_info = re.search(info_re, info_page, re.VERBOSE|re.DOTALL)
2986 raise ExtractorError(u'Unable to extract video info')
2987 video_title = m_info.group('title')
2988 video_description = m_info.group('description')
2989 video_thumb = m_info.group('thumb')
2991 m_urls = list(re.finditer(r'<src>(?P<url>.*)</src>', links_webpage))
2992 if m_urls is None or len(m_urls) == 0:
2993 raise ExtractError(u'Unable to extrat video url')
2994 # They are sorted from worst to best quality
2995 video_url = m_urls[-1].group('url')
2997 return {'url': video_url,
2999 'title': video_title,
3000 # Videos are actually flv not mp4
3002 'thumbnail': video_thumb,
3003 'description': video_description,
# Builds the ordered list of extractor instances; ordering matters because
# the first extractor whose suitable() matches handles the URL.
# NOTE(review): the vast majority of the returned list is elided in this
# view — only a few entries are visible; visible code is kept verbatim.
3006 def gen_extractors():
3007     """ Return a list of an instance of every supported extractor.
3008     The order does matter; the first extractor matched is the one handling the URL.
3011         YoutubePlaylistIE(),
3036         StanfordOpenClassroomIE(),
3046         WorldStarHipHopIE(),
def get_info_extractor(ie_name):
    """Returns the info extractor class with the given ie_name"""
    # Extractor classes follow the "<Name>IE" naming convention and live
    # at module level, so a plain globals() lookup resolves them.
    class_name = '%sIE' % ie_name
    return globals()[class_name]