2 # -*- coding: utf-8 -*-
4 from __future__ import absolute_import
15 import xml.etree.ElementTree
24 from .extractor.common import InfoExtractor, SearchInfoExtractor
26 from .extractor.ard import ARDIE
27 from .extractor.arte import ArteTvIE
28 from .extractor.dailymotion import DailymotionIE
29 from .extractor.gametrailers import GametrailersIE
30 from .extractor.metacafe import MetacafeIE
31 from .extractor.statigram import StatigramIE
32 from .extractor.photobucket import PhotobucketIE
33 from .extractor.vimeo import VimeoIE
34 from .extractor.yahoo import YahooIE
35 from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, YoutubeUserIE, YoutubeChannelIE
36 from .extractor.zdf import ZDFIE
# Last-resort extractor: first follows URL-shortener redirects via a HEAD
# request, then scans the raw page HTML for common embed patterns
# (JW Player flashvars, file=/source= params, JWPlayer JS loader,
# twitter:player:stream meta, og:video meta).
# NOTE(review): this extract is elided — several original lines are missing
# from view (e.g. the `try:` matching the `except ValueError` below, the
# HeadRequest body, and the mobj None-checks between the regex fallbacks).
# Confirm against the upstream file before changing any logic here.
48 class GenericIE(InfoExtractor):
49     """Generic last-resort information extractor."""
54     def report_download_webpage(self, video_id):
55         """Report webpage download."""
# Warn the user (outside test mode) that no specific extractor matched.
56         if not self._downloader.params.get('test', False):
57             self._downloader.report_warning(u'Falling back on generic information extractor.')
58         super(GenericIE, self).report_download_webpage(video_id)
60     def report_following_redirect(self, new_url):
61         """Report information extraction."""
62         self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
64     def _test_redirect(self, url):
65         """Check if it is a redirect, like url shorteners, in case return the new url."""
# Request subclass forcing the HEAD method (body elided in this view).
66         class HeadRequest(compat_urllib_request.Request):
70         class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
72             Subclass the HTTPRedirectHandler to make it use our
73             HeadRequest also on the redirected URL
75             def redirect_request(self, req, fp, code, msg, headers, newurl):
76                 if code in (301, 302, 303, 307):
# Spaces are illegal in URLs; percent-encode them before re-requesting.
77                     newurl = newurl.replace(' ', '%20')
# Strip body-describing headers: a HEAD re-request carries no body.
78                     newheaders = dict((k,v) for k,v in req.headers.items()
79                                       if k.lower() not in ("content-length", "content-type"))
80                     return HeadRequest(newurl,
82                                        origin_req_host=req.get_origin_req_host(),
# Any non-3xx redirect code reaching here is an error.
85                     raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)
87         class HTTPMethodFallback(compat_urllib_request.BaseHandler):
89             Fallback to GET if HEAD is not allowed (405 HTTP error)
91             def http_error_405(self, req, fp, code, msg, headers):
95                 newheaders = dict((k,v) for k,v in req.headers.items()
96                                   if k.lower() not in ("content-length", "content-type"))
97                 return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
99                                                  origin_req_host=req.get_origin_req_host(),
# Build a bare opener with only the handlers we need; order matters here
# (fallback/redirect handlers before the error processor).
103         opener = compat_urllib_request.OpenerDirector()
104         for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
105                         HTTPMethodFallback, HEADRedirectHandler,
106                         compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
107             opener.add_handler(handler())
109         response = opener.open(HeadRequest(url))
# NOTE(review): the condition guarding this raise is not visible in this
# extract — presumably a None/scheme check on the response.
111             raise ExtractorError(u'Invalid URL protocol')
112         new_url = response.geturl()
117         self.report_following_redirect(new_url)
120     def _real_extract(self, url):
# If the URL was a shortener, delegate to the resolved target instead.
121         new_url = self._test_redirect(url)
122         if new_url: return [self.url_result(new_url)]
# Crude video id: last path component of the URL.
124         video_id = url.split('/')[-1]
# NOTE(review): the `try:` for this except is missing from this extract.
126             webpage = self._download_webpage(url, video_id)
127         except ValueError as err:
128             # since this is the last-resort InfoExtractor, if
129             # this error is thrown, it'll be thrown here
130             raise ExtractorError(u'Invalid URL: %s' % url)
132         self.report_extraction(video_id)
133         # Start with something easy: JW Player in SWFObject
134         mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
136             # Broaden the search a little bit
137             mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
139             # Broaden the search a little bit: JWPlayer JS loader
140             mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
142             # Try to find twitter cards info
143             mobj = re.search(r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage)
145             # We look for Open Graph info:
146             # We have to match any number spaces between elements, some sites try to align them (eg.: statigr.am)
147             m_video_type = re.search(r'<meta.*?property="og:video:type".*?content="video/(.*?)"', webpage)
148             # We only look in og:video if the MIME type is a video, don't try if it's a Flash player:
149             if m_video_type is not None:
150                 mobj = re.search(r'<meta.*?property="og:video".*?content="(.*?)"', webpage)
152             raise ExtractorError(u'Invalid URL: %s' % url)
154         # It's possible that one of the regexes
155         # matched, but returned an empty group:
156         if mobj.group(1) is None:
157             raise ExtractorError(u'Invalid URL: %s' % url)
159         video_url = compat_urllib_parse.unquote(mobj.group(1))
160         video_id = os.path.basename(video_url)
162         # here's a fun little line of code for you:
# Derive extension and id from the URL basename (extension sans the dot).
163         video_extension = os.path.splitext(video_id)[1][1:]
164         video_id = os.path.splitext(video_id)[0]
166         # it's tempting to parse this further, but you would
167         # have to take into account all the variations like
168         #   Video Title - Site Name
169         #   Site Name | Video Title
170         #   Video Title - Tagline  | Site Name
171         # and so on and so forth; it's just not practical
172         video_title = self._html_search_regex(r'<title>(.*)</title>',
173             webpage, u'video title')
175         # video uploader is domain name
176         video_uploader = self._search_regex(r'(?:https?://)?([^/]*)/.*',
177             url, u'video uploader')
# NOTE(review): the returned info-dict literal is partially elided here.
182             'uploader': video_uploader,
184             'title': video_title,
185             'ext': video_extension,
# Search extractor for Google Video (gvsearch: prefix). Pages through the
# HTML results 10 at a time until either n results are collected or the
# "next page" marker disappears.
# NOTE(review): the lines initializing `res` (the playlist result dict with
# an 'entries' list) are missing from this extract.
190 class GoogleSearchIE(SearchInfoExtractor):
191     """Information Extractor for Google Video search queries."""
# Presence of this id/class pair in the HTML signals more result pages.
192     _MORE_PAGES_INDICATOR = r'id="pnnext" class="pn"'
194     IE_NAME = u'video.google:search'
195     _SEARCH_KEY = 'gvsearch'
197     def _get_n_results(self, query, n):
198         """Get a specified number of results for a query"""
# Google paginates with start=0,10,20,... — pagenum*10 is the offset.
206         for pagenum in itertools.count(1):
207             result_url = u'http://www.google.com/search?tbm=vid&q=%s&start=%s&hl=en' % (compat_urllib_parse.quote_plus(query), pagenum*10)
208             webpage = self._download_webpage(result_url, u'gvsearch:' + query,
209                                              note='Downloading result page ' + str(pagenum))
# Each result link lives in an <h3 class="r"> anchor.
211             for mobj in re.finditer(r'<h3 class="r"><a href="([^"]+)"', webpage):
216                 res['entries'].append(e)
# Stop once we have enough results or there is no next-page marker.
218             if (pagenum * 10 > n) or not re.search(self._MORE_PAGES_INDICATOR, webpage):
# Search extractor for Yahoo! Screen (yvsearch: prefix). Queries the JSON
# search endpoint 30 results at a time and emits url_result entries that
# the 'Yahoo' extractor will resolve.
# NOTE(review): elided extract — the `res` initialization is missing, and
# line 250 references `m` which is never assigned in this view (upstream
# binds m = info[u'm'] from the response); confirm before editing.
221 class YahooSearchIE(SearchInfoExtractor):
222     """Information Extractor for Yahoo! Video search queries."""
225     IE_NAME = u'screen.yahoo:search'
226     _SEARCH_KEY = 'yvsearch'
228     def _get_n_results(self, query, n):
229         """Get a specified number of results for a query"""
# b= is the 0-based result offset; 30 results per page.
236         for pagenum in itertools.count(0):
237             result_url = u'http://video.search.yahoo.com/search/?p=%s&fr=screen&o=js&gs=0&b=%d' % (compat_urllib_parse.quote_plus(query), pagenum * 30)
238             webpage = self._download_webpage(result_url, query,
239                                              note='Downloading results page '+str(pagenum+1))
# The endpoint returns JSON despite being fetched as a "webpage".
240             info = json.loads(webpage)
242             results = info[u'results']
244             for (i, r) in enumerate(results):
# Stop appending once n results have been gathered.
245                 if (pagenum * 30) +i >= n:
247                 mobj = re.search(r'(?P<url>screen\.yahoo\.com/.*?-\d*?\.html)"', r)
248                 e = self.url_result('http://' + mobj.group('url'), 'Yahoo')
249                 res['entries'].append(e)
250             if (pagenum * 30 +i >= n) or (m[u'last'] >= (m[u'total'] -1 )):
# Playlist extractor for a blip.tv user page: resolves the numeric user id
# from the mobile page, then pages through the Ajax episode list collecting
# per-video URLs until a short (non-full) page signals the end.
# NOTE(review): elided extract — the loop header driving `pagenum` and the
# `ids_in_page = []` reset are not visible here.
256 class BlipTVUserIE(InfoExtractor):
257     """Information Extractor for blip.tv users."""
259     _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
261     IE_NAME = u'blip.tv:user'
263     def _real_extract(self, url):
265         mobj = re.match(self._VALID_URL, url)
# NOTE(review): the `if mobj is None:` guard line is missing from view.
267             raise ExtractorError(u'Invalid URL: %s' % url)
269         username = mobj.group(1)
# %s placeholder is filled with the numeric users_id scraped below.
271         page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
273         page = self._download_webpage(url, username, u'Downloading user page')
274         mobj = re.search(r'data-users-id="([^"]+)"', page)
275         page_base = page_base % mobj.group(1)
278         # Download video ids using BlipTV Ajax calls. Result size per
279         # query is limited (currently to 12 videos) so we need to query
280         # page by page until there are no video ids - it means we got
287             url = page_base + "&page=" + str(pagenum)
288             page = self._download_webpage(url, username,
289                                           u'Downloading video ids from page %d' % pagenum)
291             # Extract video identifiers
# Every href on the listing page is treated as a candidate video path;
# duplicates within a page are skipped.
294             for mobj in re.finditer(r'href="/([^"]+)"', page):
295                 if mobj.group(1) not in ids_in_page:
296                     ids_in_page.append(unescapeHTML(mobj.group(1)))
298             video_ids.extend(ids_in_page)
300             # A little optimization - if current page is not
301             # "full", ie. does not contain PAGE_SIZE video ids then
302             # we can assume that this page is the last one - there
303             # are no more ids on further pages - no need to query
306             if len(ids_in_page) < self._PAGE_SIZE:
# Hand each collected id to the BlipTV extractor via url_result entries.
311         urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
312         url_entries = [self.url_result(url, 'BlipTV') for url in urls]
313         return [self.playlist_result(url_entries, playlist_title = username)]
# Extractor for depositfiles.com: POSTs the "Free download" form and scrapes
# the real fileshare URL (or a human-readable restriction message) from the
# response.
# NOTE(review): `.decode('utf-8')` on these values only works when they are
# bytes (Python 2 str) — this block predates the py3 port; confirm the
# surrounding compat layer before touching it.
316 class DepositFilesIE(InfoExtractor):
317     """Information extractor for depositfiles.com"""
319     _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
321     def _real_extract(self, url):
322         file_id = url.split('/')[-1]
323         # Rebuild url in english locale
324         url = 'http://depositfiles.com/en/files/' + file_id
326         # Retrieve file webpage with 'Free download' button pressed
# gateway_result=1 is the form field the site sets when the free-download
# button is clicked.
327         free_download_indication = { 'gateway_result' : '1' }
328         request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
# NOTE(review): the `try:` for the except below is missing from this view.
330             self.report_download_webpage(file_id)
331             webpage = compat_urllib_request.urlopen(request).read()
332         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
333             raise ExtractorError(u'Unable to retrieve file webpage: %s' % compat_str(err))
335         # Search for the real file URL
336         mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
337         if (mobj is None) or (mobj.group(1) is None):
338             # Try to figure out reason of the error.
# The site explains refusals (rate limits etc.) in a <strong> block.
339             mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
340             if (mobj is not None) and (mobj.group(1) is not None):
341                 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
342                 raise ExtractorError(u'%s' % restriction_message)
344                 raise ExtractorError(u'Unable to extract download URL from: %s' % url)
346         file_url = mobj.group(1)
347         file_extension = os.path.splitext(file_url)[1][1:]
349         # Search for file title
350         file_title = self._search_regex(r'<b title="(.*?)">', webpage, u'title')
# NOTE(review): the returned info-dict literal is partially elided here.
353             'id': file_id.decode('utf-8'),
354             'url': file_url.decode('utf-8'),
358             'ext': file_extension.decode('utf-8'),
# Extractor for Facebook videos. Optionally logs in (credentials from
# --username/--password or .netrc), then parses the swf.addVariable JSON
# blob embedded in the video page to get hd_src/sd_src stream URLs.
# NOTE(review): elided extract — the login_form construction and several
# guard lines (`if mobj is None:`, `else:` branches) are not visible here.
362 class FacebookIE(InfoExtractor):
363     """Information Extractor for Facebook"""
365     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
366     _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
367     _NETRC_MACHINE = 'facebook'
368     IE_NAME = u'facebook'
370     def report_login(self):
371         """Report attempt to log in."""
372         self.to_screen(u'Logging in')
374     def _real_initialize(self):
375         if self._downloader is None:
380         downloader_params = self._downloader.params
382         # Attempt to use provided username and password or .netrc data
383         if downloader_params.get('username', None) is not None:
384             useremail = downloader_params['username']
385             password = downloader_params['password']
386         elif downloader_params.get('usenetrc', False):
# NOTE(review): the `try:` for the NetrcParseError handler is elided.
388                 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
393                     raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
394             except (IOError, netrc.NetrcParseError) as err:
395                 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
# No credentials available: stay anonymous (early return elided).
398         if useremail is None:
407         request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
410             login_results = compat_urllib_request.urlopen(request).read()
# A login <form> still present in the response means login failed.
411             if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
412                 self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
414         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
415             self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
418     def _real_extract(self, url):
419         mobj = re.match(self._VALID_URL, url)
421             raise ExtractorError(u'Invalid URL: %s' % url)
422         video_id = mobj.group('ID')
424         url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
425         webpage = self._download_webpage(url, video_id)
# The player params are a JSON array sandwiched between these two literal
# script fragments; re.escape keeps the regex safe.
427         BEFORE = '{swf.addParam(param[0], param[1]);});\n'
428         AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
429         m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
431             raise ExtractorError(u'Cannot parse data')
432         data = dict(json.loads(m.group(1)))
# 'params' is itself URL-encoded JSON holding the stream descriptors.
433         params_raw = compat_urllib_parse.unquote(data['params'])
434         params = json.loads(params_raw)
435         video_data = params['video_data'][0]
# Prefer HD, fall back to SD (fallback branch partially elided below).
436         video_url = video_data.get('hd_src')
438             video_url = video_data['sd_src']
440             raise ExtractorError(u'Cannot find video URL')
441         video_duration = int(video_data['video_duration'])
442         thumbnail = video_data['thumbnail_src']
444         video_title = self._html_search_regex('<h2 class="uiHeaderTitle">([^<]+)</h2>',
# NOTE(review): the returned info-dict literal is partially elided here.
449             'title': video_title,
452             'duration': video_duration,
453             'thumbnail': thumbnail,
# Extractor for blip.tv. Handles three URL shapes: the api.swf# form (mapped
# to a /play/ URL), /play/ URLs (resolved via redirect fragment to an
# /a/a-<id> URL and re-extracted), and regular pages queried through the
# skin=json API with an iTunes User-Agent.
# NOTE(review): elided extract — the cchar selection, the direct-download
# `info` dict, and several try:/if guards are missing from this view.
458 class BlipTVIE(InfoExtractor):
459     """Information extractor for blip.tv"""
461     _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv/((.+/)|(play/)|(api\.swf#))(.+)$'
# Used to pull the extension off a media URL.
462     _URL_EXT = r'^.*\.([a-z0-9]+)$'
465     def report_direct_download(self, title):
466         """Report information extraction."""
467         self.to_screen(u'%s: Direct download detected' % title)
469     def _real_extract(self, url):
470         mobj = re.match(self._VALID_URL, url)
472             raise ExtractorError(u'Invalid URL: %s' % url)
474         # See https://github.com/rg3/youtube-dl/issues/857
475         api_mobj = re.match(r'http://a\.blip\.tv/api\.swf#(?P<video_id>[\d\w]+)', url)
476         if api_mobj is not None:
477             url = 'http://blip.tv/play/g_%s' % api_mobj.group('video_id')
478         urlp = compat_urllib_parse_urlparse(url)
479         if urlp.path.startswith('/play/'):
# /play/ URLs redirect; the real file id rides in the redirect fragment.
480             request = compat_urllib_request.Request(url)
481             response = compat_urllib_request.urlopen(request)
482             redirecturl = response.geturl()
483             rurlp = compat_urllib_parse_urlparse(redirecturl)
484             file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
485             url = 'http://blip.tv/a/a-' + file_id
# Recurse once with the canonical /a/a- URL.
486             return self._real_extract(url)
# NOTE(review): the `cchar` ('?' vs '&') selection lines are elided here.
493         json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
494         request = compat_urllib_request.Request(json_url)
# blip.tv serves a cleaner JSON payload to the iTunes client.
495         request.add_header('User-Agent', 'iTunes/10.6.1')
496         self.report_extraction(mobj.group(1))
499             urlh = compat_urllib_request.urlopen(request)
500             if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
501                 basename = url.split('/')[-1]
502                 title,ext = os.path.splitext(basename)
503                 title = title.decode('UTF-8')
504                 ext = ext.replace('.', '')
505                 self.report_direct_download(title)
515         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
516             raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
517         if info is None: # Regular URL
519                 json_code_bytes = urlh.read()
520                 json_code = json_code_bytes.decode('utf-8')
521             except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
522                 raise ExtractorError(u'Unable to read video info webpage: %s' % compat_str(err))
525                 json_data = json.loads(json_code)
# The API wraps the record in a 'Post' key in some responses.
526                 if 'Post' in json_data:
527                     data = json_data['Post']
# Timestamps arrive as e.g. '05-31-13 08:15PM'; normalize to YYYYMMDD.
531                 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
532                 video_url = data['media']['url']
533                 umobj = re.match(self._URL_EXT, video_url)
535                     raise ValueError('Can not determine filename extension')
539                     'id': data['item_id'],
541                     'uploader': data['display_name'],
542                     'upload_date': upload_date,
543                     'title': data['title'],
545                     'format': data['media']['mimeType'],
546                     'thumbnail': data['thumbnailUrl'],
547                     'description': data['description'],
548                     'player_url': data['embedUrl'],
# Downloader must reuse the same UA or blip.tv may serve different media.
549                     'user_agent': 'iTunes/10.6.1',
551             except (ValueError,KeyError) as err:
552                 raise ExtractorError(u'Unable to parse video information: %s' % repr(err))
# Extractor for myvideo.de. Tries a plain <source src> first; otherwise
# decrypts the flashvars-referenced XML (RC4 with an MD5-derived key built
# from a base64-encoded constant GK plus the video id) to recover RTMP or
# HTTP/HLS stream parameters.
# NOTE(review): heavily elided extract — the RC4 x/y/out initializations,
# the __md5 helper definition, the `params`/`sec` setup, and several
# if/else guards are missing from this view; confirm against upstream.
557 class MyVideoIE(InfoExtractor):
558     """Information Extractor for myvideo.de."""
560     _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
563     # Original Code from: https://github.com/dersphere/plugin.video.myvideo_de.git
564     # Released into the Public Domain by Tristan Fischer on 2013-05-19
565     # https://github.com/rg3/youtube-dl/pull/842
566     def __rc4crypt(self,data, key):
# Standard RC4: key-scheduling pass over a 256-entry box...
568         box = list(range(256))
569         for i in list(range(256)):
570             x = (x + box[i] + compat_ord(key[i % len(key)])) % 256
571             box[i], box[x] = box[x], box[i]
# ...then the PRGA keystream XORed against each input byte.
577             y = (y + box[x]) % 256
578             box[x], box[y] = box[y], box[x]
579             out += chr(compat_ord(char) ^ box[(box[x] + box[y]) % 256])
583         return hashlib.md5(s).hexdigest().encode()
585     def _real_extract(self,url):
586         mobj = re.match(self._VALID_URL, url)
588             raise ExtractorError(u'invalid URL: %s' % url)
590         video_id = mobj.group(1)
# GK: obfuscated (doubly base64-encoded) site constant used as RC4 key
# material together with the video id.
593             b'WXpnME1EZGhNRGhpTTJNM01XVmhOREU0WldNNVpHTTJOakpt'
594             b'TW1FMU5tVTBNR05pWkRaa05XRXhNVFJoWVRVd1ptSXhaVEV3'
595             b'TnpsbA0KTVRkbU1tSTRNdz09'
599         webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
600         webpage = self._download_webpage(webpage_url, video_id)
# Cheap path: a plain HTML5 <source> tag means no decryption is needed.
602         mobj = re.search('source src=\'(.+?)[.]([^.]+)\'', webpage)
604             self.report_extraction(video_id)
605             video_url = mobj.group(1) + '.flv'
607             video_title = self._html_search_regex('<title>([^<]+)</title>',
610             video_ext = self._search_regex('[.](.+?)$', video_url, u'extension')
617                 'title': video_title,
# Hard path: pull the flashvars blob and decrypt the player XML.
622         mobj = re.search('var flashvars={(.+?)}', webpage)
624             raise ExtractorError(u'Unable to extract video')
629         for (a, b) in re.findall('(.+?):\'(.+?)\',?', sec):
# '_encxml' holds the encrypted-XML endpoint; everything else is a
# pass-through query parameter.
630             if not a == '_encxml':
633                 encxml = compat_urllib_parse.unquote(b)
634         if not params.get('domain'):
635             params['domain'] = 'www.myvideo.de'
636         xmldata_url = '%s?%s' % (encxml, compat_urllib_parse.urlencode(params))
# The MTV player variant is not supported; force the D player instead.
637         if 'flash_playertype=MTV' in xmldata_url:
638             self._downloader.report_warning(u'avoiding MTV player')
640                 'http://www.myvideo.de/dynamic/get_player_video_xml.php'
641                 '?flash_playertype=D&ID=%s&_countlimit=4&autorun=yes'
# Response looks like 'enc=<hex>'; keep only the hex payload.
645         enc_data = self._download_webpage(xmldata_url, video_id).split('=')[1]
646         enc_data_b = binascii.unhexlify(enc_data)
648             base64.b64decode(base64.b64decode(GK)) +
650             str(video_id).encode('utf-8')
653         dec_data = self.__rc4crypt(enc_data_b, sk)
656         self.report_extraction(video_id)
# RTMP case: connectionurl present in the decrypted XML.
659         mobj = re.search('connectionurl=\'(.*?)\'', dec_data)
661             video_url = compat_urllib_parse.unquote(mobj.group(1))
662             if 'myvideo2flash' in video_url:
# rtmpe handshakes fail for these hosts; plain rtmpt works.
663                 self._downloader.report_warning(u'forcing RTMPT ...')
664                 video_url = video_url.replace('rtmpe://', 'rtmpt://')
667             # extract non rtmp videos
668             mobj = re.search('path=\'(http.*?)\' source=\'(.*?)\'', dec_data)
670                 raise ExtractorError(u'unable to extract url')
671             video_url = compat_urllib_parse.unquote(mobj.group(1)) + compat_urllib_parse.unquote(mobj.group(2))
673         video_file = self._search_regex('source=\'(.*?)\'', dec_data, u'video file')
674         video_file = compat_urllib_parse.unquote(video_file)
# f4m manifests are converted to an m3u8 HLS playlist URL; other files
# become an rtmp play path of the form 'ext:path'.
676         if not video_file.endswith('f4m'):
677             ppath, prefix = video_file.split('.')
678             video_playpath = '%s:%s' % (prefix, ppath)
679             video_hls_playlist = ''
682             video_hls_playlist = (
683                 video_filepath + video_file
684             ).replace('.f4m', '.m3u8')
686         video_swfobj = self._search_regex('swfobject.embedSWF\(\'(.+?)\'', webpage, u'swfobj')
687         video_swfobj = compat_urllib_parse.unquote(video_swfobj)
689         video_title = self._html_search_regex("<h1(?: class='globalHd')?>(.*?)</h1>",
698             'title': video_title,
700             'play_path': video_playpath,
701             'video_file': video_file,
702             'video_hls_playlist': video_hls_playlist,
703             'player_url': video_swfobj,
# Extractor for The Daily Show / The Colbert Report. Resolves shortname or
# show URLs to an episode page, finds the mtvnservices media URI, downloads
# the MRSS index, then per-part fetches the mediaGen config and rewrites the
# chosen RTMP rendition into a direct llnwd.net HTTP URL.
# NOTE(review): elided extract — the _video_extensions/_video_dimensions
# dict bodies, the turls accumulation, the format-selection loop, and the
# results-list assembly are missing from this view.
707 class ComedyCentralIE(InfoExtractor):
708     """Information extractor for The Daily Show and Colbert Report """
710     # urls can be abbreviations like :thedailyshow or :colbert
711     # urls for episodes like:
712     # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
713     # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
714     # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
715     _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
716                       |(https?://)?(www\.)?
717                           (?P<showname>thedailyshow|colbertnation)\.com/
718                          (full-episodes/(?P<episode>.*)|
720                           (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
721                           |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
# Bitrates the site offers, lowest preference last... (order used when
# picking turls[-1] below, i.e. highest bitrate wins by default).
724     _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
726     _video_extensions = {
734     _video_dimensions = {
# _VALID_URL is written with re.VERBOSE whitespace, so suitable() must
# pass the flag explicitly — hence the override.
744     def suitable(cls, url):
745         """Receives a URL and returns True if suitable for this IE."""
746         return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
748     def _print_formats(self, formats):
749         print('Available formats:')
751             print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))
754     def _real_extract(self, url):
755         mobj = re.match(self._VALID_URL, url, re.VERBOSE)
757             raise ExtractorError(u'Invalid URL: %s' % url)
# Expand :tds/:colbert shortcuts into full-episodes listing URLs and
# re-match so the named groups are populated.
759         if mobj.group('shortname'):
760             if mobj.group('shortname') in ('tds', 'thedailyshow'):
761                 url = u'http://www.thedailyshow.com/full-episodes/'
763                 url = u'http://www.colbertnation.com/full-episodes/'
764             mobj = re.match(self._VALID_URL, url, re.VERBOSE)
765             assert mobj is not None
767         if mobj.group('clip'):
768             if mobj.group('showname') == 'thedailyshow':
769                 epTitle = mobj.group('tdstitle')
771                 epTitle = mobj.group('cntitle')
774             dlNewest = not mobj.group('episode')
776                 epTitle = mobj.group('showname')
778                 epTitle = mobj.group('episode')
780         self.report_extraction(epTitle)
# Use the handle to detect where the listing page redirected us.
781         webpage,htmlHandle = self._download_webpage_handle(url, epTitle)
783             url = htmlHandle.geturl()
784             mobj = re.match(self._VALID_URL, url, re.VERBOSE)
786                 raise ExtractorError(u'Invalid redirected URL: ' + url)
787             if mobj.group('episode') == '':
788                 raise ExtractorError(u'Redirected URL is still not specific: ' + url)
789             epTitle = mobj.group('episode')
791         mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)
793         if len(mMovieParams) == 0:
794             # The Colbert Report embeds the information in a without
795             # a URL prefix; so extract the alternate reference
796             # and then add the URL prefix manually.
798             altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
799             if len(altMovieParams) == 0:
800                 raise ExtractorError(u'unable to find Flash URL in webpage ' + url)
802                 mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
804         uri = mMovieParams[0][1]
805         indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
806         indexXml = self._download_webpage(indexUrl, epTitle,
807                                           u'Downloading show index',
808                                           u'unable to download episode index')
812         idoc = xml.etree.ElementTree.fromstring(indexXml)
# One <item> per episode part (act); each has its own media config.
813         itemEls = idoc.findall('.//item')
814         for partNum,itemEl in enumerate(itemEls):
815             mediaId = itemEl.findall('./guid')[0].text
816             shortMediaId = mediaId.split(':')[-1]
817             showId = mediaId.split(':')[-2].replace('.com', '')
818             officialTitle = itemEl.findall('./title')[0].text
819             officialDate = unified_strdate(itemEl.findall('./pubDate')[0].text)
821             configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
822                         compat_urllib_parse.urlencode({'uri': mediaId}))
823             configXml = self._download_webpage(configUrl, epTitle,
824                                                u'Downloading configuration for %s' % shortMediaId)
826             cdoc = xml.etree.ElementTree.fromstring(configXml)
828             for rendition in cdoc.findall('.//rendition'):
# (bitrate, rtmp_url) pairs; accumulation into turls is elided here.
829                 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
833                 self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
836             if self._downloader.params.get('listformats', None):
837                 self._print_formats([i[0] for i in turls])
840             # For now, just pick the highest bitrate
841             format,rtmp_video_url = turls[-1]
843             # Get the format arg from the arg stream
844             req_format = self._downloader.params.get('format', None)
846             # Select format if we can find one
849                     format, rtmp_video_url = f, v
# The RTMP path embeds a gsp.comedystor asset id that maps 1:1 onto a
# direct llnwd.net HTTP mirror — rewrite rather than speak RTMP.
852             m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
854                 raise ExtractorError(u'Cannot transform RTMP url')
855             base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
856             video_url = base + m.group('finalid')
858             effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
863                 'upload_date': officialDate,
868                 'description': officialTitle,
# Extractor for escapistmagazine.com videos: reads meta tags for
# description/thumbnail/player URL, then fetches the player's config
# (JS masquerading as JSON) to get the playlist media URL.
# NOTE(review): elided extract — the returned info dict is partially
# missing. Also note line 901 labels the title regex u'player url',
# apparently a copy-paste slip for u'title' (only affects the error
# message); left unchanged since this view is incomplete.
875 class EscapistIE(InfoExtractor):
876     """Information extractor for The Escapist """
878     _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
879     IE_NAME = u'escapist'
881     def _real_extract(self, url):
882         mobj = re.match(self._VALID_URL, url)
884             raise ExtractorError(u'Invalid URL: %s' % url)
885         showName = mobj.group('showname')
886         videoId = mobj.group('episode')
888         self.report_extraction(videoId)
889         webpage = self._download_webpage(url, videoId)
891         videoDesc = self._html_search_regex('<meta name="description" content="([^"]*)"',
892             webpage, u'description', fatal=False)
894         imgUrl = self._html_search_regex('<meta property="og:image" content="([^"]*)"',
895             webpage, u'thumbnail', fatal=False)
897         playerUrl = self._html_search_regex('<meta property="og:video" content="([^"]*)"',
898             webpage, u'player url')
# Page titles look like 'Show : Episode'; keep the part after ' : '.
900         title = self._html_search_regex('<meta name="title" content="([^"]*)"',
901             webpage, u'player url').split(' : ')[-1]
# The config URL rides URL-encoded in the player URL's query string.
903         configUrl = self._search_regex('config=(.*)$', playerUrl, u'config url')
904         configUrl = compat_urllib_parse.unquote(configUrl)
906         configJSON = self._download_webpage(configUrl, videoId,
907                                             u'Downloading configuration',
908                                             u'unable to download configuration')
910         # Technically, it's JavaScript, not JSON
# Single→double quote swap makes the JS object literal json-parseable.
911         configJSON = configJSON.replace("'", '"')
914             config = json.loads(configJSON)
915         except (ValueError,) as err:
916             raise ExtractorError(u'Invalid JSON in configuration file: ' + compat_str(err))
918         playlist = config['playlist']
# Index 1 holds the actual media entry (index 0 is presumably an intro —
# TODO confirm against a live config).
919         videoUrl = playlist[1]['url']
924             'uploader': showName,
929             'description': videoDesc,
930             'player_url': playerUrl,
# Extractor for collegehumor.com: downloads the moogaloop metadata XML,
# then the f4m manifest it points to, and reconstructs a direct segment
# URL (.../z<id>/<node>Seg1-Frag1) from the manifest's media node.
# NOTE(review): elided extract — the `info` dict initialization, the try:
# lines for several except clauses, and the final return are not visible.
935 class CollegeHumorIE(InfoExtractor):
936     """Information extractor for collegehumor.com"""
939     _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
940     IE_NAME = u'collegehumor'
942     def report_manifest(self, video_id):
943         """Report information extraction."""
944         self.to_screen(u'%s: Downloading XML manifest' % video_id)
946     def _real_extract(self, url):
947         mobj = re.match(self._VALID_URL, url)
949             raise ExtractorError(u'Invalid URL: %s' % url)
950         video_id = mobj.group('videoid')
958         self.report_extraction(video_id)
959         xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
961             metaXml = compat_urllib_request.urlopen(xmlUrl).read()
962         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
963             raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
965         mdoc = xml.etree.ElementTree.fromstring(metaXml)
# findall(...)[0] raises IndexError on malformed XML, caught below (the
# try: line is elided from this view).
967             videoNode = mdoc.findall('./video')[0]
968             info['description'] = videoNode.findall('./description')[0].text
969             info['title'] = videoNode.findall('./caption')[0].text
970             info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
971             manifest_url = videoNode.findall('./file')[0].text
973             raise ExtractorError(u'Invalid metadata XML file')
# hdcore param is required for Adobe HDS manifest delivery.
975         manifest_url += '?hdcore=2.10.3'
976         self.report_manifest(video_id)
978             manifestXml = compat_urllib_request.urlopen(manifest_url).read()
979         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
980             raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
982         adoc = xml.etree.ElementTree.fromstring(manifestXml)
# f4m elements live in the Adobe f4m XML namespace.
984             media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
985             node_id = media_node.attrib['url']
986             video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
987         except IndexError as err:
988             raise ExtractorError(u'Invalid manifest file')
990         url_pr = compat_urllib_parse_urlparse(manifest_url)
# Rebuild the direct fragment URL on the manifest's host; video_id[:-2]
# drops a 2-char suffix — presumably a quality marker (TODO confirm).
991         url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
# Extractor for xvideos.com: scrapes flv_url, the page title, and the
# thumbnail straight out of the watch-page HTML.
# NOTE(review): elided extract — the returned info-dict literal is only
# partially visible (no opening brace / id / url keys in view).
998 class XVideosIE(InfoExtractor):
999     """Information extractor for xvideos.com"""
1001     _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
1002     IE_NAME = u'xvideos'
1004     def _real_extract(self, url):
1005         mobj = re.match(self._VALID_URL, url)
1007             raise ExtractorError(u'Invalid URL: %s' % url)
1008         video_id = mobj.group(1)
1010         webpage = self._download_webpage(url, video_id)
1012         self.report_extraction(video_id)
# The player receives the stream as a percent-encoded flv_url flashvar.
1015         video_url = compat_urllib_parse.unquote(self._search_regex(r'flv_url=(.+?)&',
1016             webpage, u'video URL'))
# Titles end in ' - XVIDEOS...'; capture only the part before it.
1019         video_title = self._html_search_regex(r'<title>(.*?)\s+-\s+XVID',
1022         # Extract video thumbnail
1023         video_thumbnail = self._search_regex(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)',
1024             webpage, u'thumbnail', fatal=False)
1030             'upload_date': None,
1031             'title': video_title,
1033             'thumbnail': video_thumbnail,
1034             'description': None,
# Extractor for a single soundcloud.com track: resolves the human URL to a
# track id via the resolve.json API, then reads the stream-definitions
# endpoint for the 128kbps MP3 URL. Uses a hard-coded public client_id.
# NOTE(review): elided extract — the returned info-dict is only partially
# visible here.
1040 class SoundcloudIE(InfoExtractor):
1041     """Information extractor for soundcloud.com
1042        To access the media, the uid of the song and a stream token
1043        must be extracted from the page source and the script must make
1044        a request to media.soundcloud.com/crossdomain.xml. Then
1045        the media can be grabbed by requesting from an url composed
1046        of the stream token and uid
1049     _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
1050     IE_NAME = u'soundcloud'
1052     def report_resolve(self, video_id):
1053         """Report information extraction."""
1054         self.to_screen(u'%s: Resolving id' % video_id)
1056     def _real_extract(self, url):
1057         mobj = re.match(self._VALID_URL, url)
1059             raise ExtractorError(u'Invalid URL: %s' % url)
1061         # extract uploader (which is in the url)
1062         uploader = mobj.group(1)
1063         # extract simple title (uploader + slug of song title)
1064         slug_title = mobj.group(2)
1065         simple_title = uploader + u'-' + slug_title
1066         full_title = '%s/%s' % (uploader, slug_title)
1068         self.report_resolve(full_title)
# resolve.json maps the canonical page URL to the API track record.
1070         url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
1071         resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
1072         info_json = self._download_webpage(resolv_url, full_title, u'Downloading info JSON')
1074         info = json.loads(info_json)
1075         video_id = info['id']
1076         self.report_extraction(full_title)
1078         streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
1079         stream_json = self._download_webpage(streams_url, full_title,
1080                                              u'Downloading stream definitions',
1081                                              u'unable to download stream definitions')
1083         streams = json.loads(stream_json)
# Fixed choice: the 128kbps MP3 HTTP stream.
1084         mediaURL = streams['http_mp3_128_url']
1085         upload_date = unified_strdate(info['created_at'])
1090             'uploader': info['user']['username'],
1091             'upload_date': upload_date,
1092             'title': info['title'],
1094             'description': info['description'],
# Extractor for soundcloud.com sets (playlists): resolves the set URL via
# resolve.json, then fetches the 128kbps MP3 stream URL for each track in
# the set, building one info dict per track.
# NOTE(review): elided extract — the per-track info-dict literal and the
# results accumulation/return are only partially visible here.
1097 class SoundcloudSetIE(InfoExtractor):
1098     """Information extractor for soundcloud.com sets
1099        To access the media, the uid of the song and a stream token
1100        must be extracted from the page source and the script must make
1101        a request to media.soundcloud.com/crossdomain.xml. Then
1102        the media can be grabbed by requesting from an url composed
1103        of the stream token and uid
1106     _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
1107     IE_NAME = u'soundcloud:set'
1109     def report_resolve(self, video_id):
1110         """Report information extraction."""
1111         self.to_screen(u'%s: Resolving id' % video_id)
1113     def _real_extract(self, url):
1114         mobj = re.match(self._VALID_URL, url)
1116             raise ExtractorError(u'Invalid URL: %s' % url)
1118         # extract uploader (which is in the url)
1119         uploader = mobj.group(1)
1120         # extract simple title (uploader + slug of song title)
1121         slug_title = mobj.group(2)
1122         simple_title = uploader + u'-' + slug_title
1123         full_title = '%s/sets/%s' % (uploader, slug_title)
1125         self.report_resolve(full_title)
1127         url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
1128         resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
1129         info_json = self._download_webpage(resolv_url, full_title)
1132         info = json.loads(info_json)
# The resolve API reports failures as an 'errors' list rather than HTTP
# error codes; surface each message to the user.
1133         if 'errors' in info:
1134             for err in info['errors']:
1135                 self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
1138         self.report_extraction(full_title)
1139         for track in info['tracks']:
1140             video_id = track['id']
1142             streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
1143             stream_json = self._download_webpage(streams_url, video_id, u'Downloading track info JSON')
1145             self.report_extraction(video_id)
1146             streams = json.loads(stream_json)
# Same fixed stream choice as SoundcloudIE: 128kbps MP3 over HTTP.
1147             mediaURL = streams['http_mp3_128_url']
1152                 'uploader': track['user']['username'],
1153                 'upload_date':  unified_strdate(track['created_at']),
1154                 'title': track['title'],
1156                 'description': track['description'],
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def _real_extract(self, url):
        """Decode the base64 'jsclassref' value into an RTMPE stream path."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # Extract video URL: the page embeds it base64-encoded in 'jsclassref'.
        mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video url')
        real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        # Extract title
        video_title = self._search_regex(r'contentTitle = "(.*?)";',
            webpage, u'title')

        # Extract description
        video_description = self._html_search_regex(r'<meta name="description" content="(.*)"(?:\s*/)?>',
            webpage, u'description', fatal=False)

        # Derive id/extension from the last path component of the stream URL.
        video_filename = video_url.split('/')[-1]
        video_id, extension = video_filename.split('.')

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }

        return [info]
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def report_download_json(self, file_id):
        """Report JSON download."""
        self.to_screen(u'Downloading json')

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json"""
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        for url in url_list:
            try:
                # A successful open means the mirror is alive.
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error):
                # Dead mirror; try the next one.
                pass
        return None

    def _print_formats(self, formats):
        """Print every available format/bitrate/extension combination."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        """Fetch cloudcast JSON, pick a working mirror for the requested format."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        # extract uploader & filename from url
        uploader = mobj.group(1).decode('utf-8')
        file_id = uploader + "-" + mobj.group(2).decode('utf-8')

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))

        # parse JSON
        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        if req_format is None or req_format == 'best':
            # Probe every format until one yields a live mirror.
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                # check urls
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats:
                raise ExtractorError(u'Format is not available')

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        return [{
            'id': file_id.decode('utf-8'),
            'url': file_url.decode('utf-8'),
            'uploader': uploader.decode('utf-8'),
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1].decode('utf-8'),
            'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url.decode('utf-8'),
        }]
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""

    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def _real_extract(self, url):
        """Dispatch on URL shape: a single video, a course page, or the site root."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                raise ExtractorError(u'Invalid metadata XML file')
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'uploader': None,
                'upload_date': None,
            }

            coursepage = self._download_webpage(url, info['id'],
                                        note='Downloading course info page',
                                        errnote='Unable to download course info page')

            info['title'] = self._html_search_regex('<h1>([^<]+)</h1>', coursepage, 'title', default=info['id'])

            info['description'] = self._html_search_regex('<description>([^<]+)</description>',
                coursepage, u'description', fatal=False)

            # Each linked VideoPage is re-dispatched through this extractor.
            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
        else: # Root page: enumerate every course and recurse.
            info = {
                'id': 'Stanford OpenClassroom',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                rootpage = compat_urllib_request.urlopen(rootURL).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download course info page: ' + compat_str(err))

            info['title'] = info['id']

            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def _real_extract(self, url):
        """Read meta tags from the page, then fetch the mediaGen XML for renditions."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        song_name = self._html_search_regex(r'<meta name="mtv_vt" content="([^"]+)"/>',
            webpage, u'song name', fatal=False)

        video_title = self._html_search_regex(r'<meta name="mtv_an" content="([^"]+)"/>',
            webpage, u'title')
        # NOTE(review): 'mtv_an' is the artist name meta tag; the original listing
        # referenced an undefined `performer` in the info dict below — bind it here.
        performer = video_title

        mtvn_uri = self._html_search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>',
            webpage, u'mtvn_uri', fatal=False)

        content_id = self._search_regex(r'MTVN.Player.defaultPlaylistId = ([0-9]+);',
            webpage, u'content id', fatal=False)

        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            raise ExtractorError('Invalid rendition field.')

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com (segmented flv/mp4 downloads)."""
    _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def _gen_sid(self):
        """Build a pseudo-unique session id: millisecond timestamp + two random ints."""
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        """Derive Youku's character-shuffle table from the numeric seed (their PRNG)."""
        mixed = []
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        seed = float(seed)
        for i in range(len(source)):
            # Linear-congruential step, then pick-and-remove from the alphabet.
            seed = (seed * 211 + 30031) % 65536
            index = math.floor(seed / 65536 * len(source))
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        #return ''.join(mixed)
        return mixed

    def _get_file_id(self, fileId, seed):
        """Translate the '*'-separated index list into the real file id."""
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        realId = []
        for ch in ids:
            if ch:
                realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        """Fetch the playlist JSON and emit one info dict per video segment."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        jsondata = self._download_webpage(info_url, video_id)

        self.report_extraction(video_id)
        try:
            config = json.loads(jsondata)

            video_title = config['data'][0]['title']
            seed = config['data'][0]['seed']

            format = self._downloader.params.get('format', None)
            supported_format = list(config['data'][0]['streamfileids'].keys())

            if format is None or format == 'best':
                if 'hd2' in supported_format:
                    format = 'hd2'
                else:
                    format = 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'

            fileid = config['data'][0]['streamfileids'][format]
            keys = [s['k'] for s in config['data'][0]['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            raise ExtractorError(u'Unable to extract info section')

        files_info = []
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        #column 8,9 of fileid represent the segment number
        #fileid[7:9] should be changed
        for index, key in enumerate(keys):
            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            info = {
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            }
            files_info.append(info)

        return files_info
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    # All three values are URL-encoded parameters embedded in the page source.
    VIDEO_URL_RE = r'flv_url=(.*?)&'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'

    def _real_extract(self, url):
        """Scrape flv URL, title and thumbnail straight out of the watch page."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        video_url = self._search_regex(self.VIDEO_URL_RE,
            webpage, u'video URL')
        video_url = compat_urllib_parse.unquote(video_url)

        video_title = self._html_search_regex(self.VIDEO_TITLE_RE,
            webpage, u'title')

        video_thumbnail = self._search_regex(self.VIDEO_THUMB_RE,
            webpage, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def _real_extract(self, url):
        """Two-step extraction: post page for metadata, then the photo/video page for links."""
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        post_url = mobj.group(0)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage')

        self.report_extraction(video_id)

        # Extract update date
        upload_date = self._html_search_regex('title="Timestamp">(.*?)</a>',
            webpage, u'upload date', fatal=False)
        if upload_date:
            # Convert timestring to a format suitable for filename
            upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
            upload_date = upload_date.strftime('%Y%m%d')

        # Extract uploader
        uploader = self._html_search_regex(r'rel\="author".*?>(.*?)</a>',
            webpage, u'uploader', fatal=False)

        # Extract title
        # Get the first line for title
        video_title = self._html_search_regex(r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]',
            webpage, 'title', default=u'NA')

        # Step 2, Stimulate clicking the image box to launch video
        video_page = self._search_regex('"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]',
            webpage, u'video page URL')
        webpage = self._download_webpage(video_page, video_id, u'Downloading video page')

        # Extract video links of all sizes
        pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        mobj = re.findall(pattern, webpage)
        if len(mobj) == 0:
            raise ExtractorError(u'Unable to extract video links')

        # Sort in resolution
        links = sorted(mobj)

        # Choose the lowest of the sort, i.e. highest resolution
        video_url = links[-1]
        # Only get the url. The resolution part in the tuple has no use anymore
        video_url = video_url[-1]
        # Treat escaped \u0026 style hex
        try:
            video_url = video_url.decode("unicode_escape")
        except AttributeError: # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': uploader,
            'upload_date': upload_date,
            'title': video_title,
            'ext': video_extension,
        }]
class NBAIE(InfoExtractor):
    """Information extractor for watch.nba.com — the mp4 URL is built from the page path."""
    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*?)(?:/index\.html)?(?:\?.*)?$'
    IE_NAME = u'nba'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group(1)

        webpage = self._download_webpage(url, video_id)

        # The CDN URL is deterministic: path component + fixed suffix.
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'

        shortened_video_id = video_id.rpartition('/')[2]
        title = self._html_search_regex(r'<meta property="og:title" content="(.*?)"',
            webpage, 'title', default=shortened_video_id).replace('NBA.com: ', '')

        # It isn't there in the HTML it returns to us
        # uploader_date = self._html_search_regex(r'<b>Date:</b> (.*?)</div>', webpage, 'upload_date', fatal=False)

        description = self._html_search_regex(r'<meta name="description" (?:content|value)="(.*?)" />', webpage, 'description', fatal=False)

        info = {
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            # 'uploader_date': uploader_date,
            'description': description,
        }
        return [info]
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        (?:
            (?P<channelid>[^/]+)|
            (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
            (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
        )
        /?(?:\#.*)?$
        """
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self.to_screen(u'%s: Downloading video information from %d to %d' %
                (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url, video_id):
        """Download one API page; skip clips with an empty video_file_url."""
        webpage = self._download_webpage(url, video_id,
                                         u'Downloading video info JSON',
                                         u'unable to download video info JSON')

        response = json.loads(webpage)
        if type(response) != list:
            # Error responses are dicts carrying an 'error' message.
            error_text = response.get('error', 'unknown error')
            raise ExtractorError(u'Justin.tv API: %s' % error_text)
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                video_date = re.sub('-', '', clip['start_time'][:10])
                video_uploader_id = clip.get('user_id', clip.get('channel_id'))
                video_id = clip['id']
                video_title = clip.get('title', video_id)
                info.append({
                    'id': video_id,
                    'url': video_url,
                    'title': video_title,
                    'uploader': clip.get('channel_name', video_uploader_id),
                    'uploader_id': video_uploader_id,
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        """Handle the three URL shapes: channel archive (paged), chapter, single broadcast."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        api_base = 'http://api.justin.tv'
        paged = False
        if mobj.group('channelid'):
            paged = True
            video_id = mobj.group('channelid')
            api = api_base + '/channel/archives/%s.json' % video_id
        elif mobj.group('chapterid'):
            chapter_id = mobj.group('chapterid')

            webpage = self._download_webpage(url, chapter_id)
            m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
            if not m:
                raise ExtractorError(u'Cannot find archive of a chapter')
            archive_id = m.group(1)

            api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
            chapter_info_xml = self._download_webpage(api, chapter_id,
                                             note=u'Downloading chapter information',
                                             errnote=u'Chapter information download failed')
            doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
            for a in doc.findall('.//archive'):
                if archive_id == a.find('./id').text:
                    break
            else:
                raise ExtractorError(u'Could not find chapter in chapter information')

            video_url = a.find('./video_file_url').text
            video_ext = video_url.rpartition('.')[2] or u'flv'

            chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
            chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
                                   note='Downloading chapter metadata',
                                   errnote='Download of chapter metadata failed')
            chapter_info = json.loads(chapter_info_json)

            bracket_start = int(doc.find('.//bracket_start').text)
            bracket_end = int(doc.find('.//bracket_end').text)

            # TODO determine start (and probably fix up file)
            #  youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
            #video_url += u'?start=' + TODO:start_timestamp
            # bracket_start is 13290, but we want 51670615
            self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
                                            u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))

            info = {
                'id': u'c' + chapter_id,
                'url': video_url,
                'ext': video_ext,
                'title': chapter_info['title'],
                'thumbnail': chapter_info['preview'],
                'description': chapter_info['description'],
                'uploader': chapter_info['channel']['display_name'],
                'uploader_id': chapter_info['channel']['name'],
            }
            return [info]
        else:
            video_id = mobj.group('videoid')
            api = api_base + '/broadcast/by_archive/%s.json' % video_id

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url, video_id)
            info.extend(page_info)
            # A short page means we've reached the end of the archive.
            if not paged or page_count != limit:
                break
            offset += limit
        return info
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com."""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        video_url = self._html_search_regex(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"',
            webpage, u'video URL', flags=re.DOTALL)

        # Two title candidates: the player heading, falling back to <title>.
        title = self._html_search_regex((r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>",
            r'<title>(?P<title>[^<]+?)</title>'), webpage, 'title', flags=re.DOTALL)

        video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
            webpage, u'description', fatal=False, flags=re.DOTALL)

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': video_description,
        }
        return [info]
class SteamIE(InfoExtractor):
    """Information extractor for store.steampowered.com game trailer pages."""
    _VALID_URL = r"""http://store\.steampowered\.com/
                (agecheck/)?
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """
    _VIDEO_PAGE_TEMPLATE = 'http://store.steampowered.com/video/%s/'
    _AGECHECK_TEMPLATE = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL is written in verbose-regex mode.
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        """Return a playlist of every trailer found on the game's video page."""
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        gameID = m.group('gameID')

        videourl = self._VIDEO_PAGE_TEMPLATE % gameID
        webpage = self._download_webpage(videourl, gameID)

        if re.search('<h2>Please enter your birth date to continue:</h2>', webpage) is not None:
            # Age-gated game: re-fetch through the age-check URL with a fixed DOB.
            videourl = self._AGECHECK_TEMPLATE % gameID
            self.report_age_confirmation()
            webpage = self._download_webpage(videourl, gameID)

        self.report_extraction(gameID)
        game_title = self._html_search_regex(r'<h2 class="pageheader">(.*?)</h2>',
                                             webpage, 'game title')

        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        mweb = re.finditer(urlRE, webpage)
        namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
        titles = re.finditer(namesRE, webpage)
        thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
        thumbs = re.finditer(thumbsRE, webpage)
        videos = []
        # The three iterators run in page order, so zip pairs them up correctly.
        for vid, vtitle, thumb in zip(mweb, titles, thumbs):
            video_id = vid.group('videoID')
            title = vtitle.group('videoName')
            video_url = vid.group('videoURL')
            video_thumb = thumb.group('thumbnail')
            if not video_url:
                raise ExtractorError(u'Cannot find video url for %s' % video_id)
            info = {
                'id': video_id,
                'url': video_url,
                'ext': 'flv',
                'title': unescapeHTML(title),
                'thumbnail': video_thumb
            }
            videos.append(info)
        return [self.playlist_result(videos, gameID, game_title)]
class UstreamIE(InfoExtractor):
    """Information extractor for www.ustream.tv recorded videos."""
    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')

        # The flv lives at a deterministic CDN path derived from the id.
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        video_title = self._html_search_regex(r'data-title="(?P<title>.+)"',
            webpage, u'title')

        uploader = self._html_search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
            webpage, u'uploader', fatal=False, flags=re.DOTALL)

        thumbnail = self._html_search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"',
            webpage, u'thumbnail', fatal=False)

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'flv',
            'title': video_title,
            'uploader': uploader,
            'thumbnail': thumbnail,
        }
        return info
class WorldStarHipHopIE(InfoExtractor):
    """Information extractor for worldstarhiphop.com / worldstarcandy.com."""
    _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
    IE_NAME = u'WorldStarHipHop'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('id')

        webpage_src = self._download_webpage(url, video_id)

        video_url = self._search_regex(r'so\.addVariable\("file","(.*?)"\)',
            webpage_src, u'video URL')

        # Pick the container from the URL itself.
        if 'mp4' in video_url:
            ext = 'mp4'
        else:
            ext = 'flv'

        video_title = self._html_search_regex(r"<title>(.*)</title>",
            webpage_src, u'title')

        # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
        thumbnail = self._html_search_regex(r'rel="image_src" href="(.*)" />',
            webpage_src, u'thumbnail', fatal=False)

        if not thumbnail:
            _title = r"""candytitles.*>(.*)</span>"""
            mobj = re.search(_title, webpage_src)
            if mobj is not None:
                video_title = mobj.group(1)

        results = [{
            'id': video_id,
            'url': video_url,
            'title': video_title,
            'thumbnail': thumbnail,
            'ext': ext,
        }]
        return results
class RBMARadioIE(InfoExtractor):
    """Information extractor for rbmaradio.com shows."""
    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')

        webpage = self._download_webpage(url, video_id)

        # Show metadata is assigned to window.gon in an inline script.
        json_data = self._search_regex(r'window\.gon.*?gon\.show=(.+?);$',
            webpage, u'json data', flags=re.MULTILINE)

        try:
            data = json.loads(json_data)
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        video_url = data['akamai_url'] + '&cbr=256'
        url_parts = compat_urllib_parse_urlparse(video_url)
        video_ext = url_parts.path.rpartition('.')[2]
        info = {
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': data['title'],
            'description': data.get('teaser_text'),
            'location': data.get('country_of_origin'),
            'uploader': data.get('host', {}).get('name'),
            'uploader_id': data.get('host', {}).get('slug'),
            'thumbnail': data.get('image', {}).get('large_url_2x'),
            'duration': data.get('duration'),
        }
        return [info]
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s' % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        """Return the format dict matching req_format, or None."""
        for x in formats:
            if(x["format"]==req_format):
                return x
        return None

    def _real_extract(self, url):
        """Parse the embedded Video(...) JSON, then enumerate the download list."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        # The site serves the full page only with the age-verification cookie set.
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get JSON parameters
        json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters')
        try:
            params = json.loads(json_params)
        except ValueError:
            raise ExtractorError(u'Invalid JSON')

        self.report_extraction(video_id)
        try:
            video_title = params['title']
            upload_date = unified_strdate(params['release_date_f'])
            video_description = params['description']
            video_uploader = params['submitted_by']
            thumbnail = params['thumbnails'][0]['image']
        except KeyError:
            raise ExtractorError('Missing JSON parameter: ' + sys.exc_info()[1])

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
            webpage, u'download list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if(len(links) == 0):
            raise ExtractorError(u'ERROR: no known formats available for video')

        self.to_screen(u'Links found: %d' % len(links))

        formats = []
        for link in links:
            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML( link )
            path = compat_urllib_parse_urlparse( video_url ).path
            extension = os.path.splitext( path )[1][1:]
            # Path component 4 encodes "<size>_<bitrate>_<id>"; keep size+bitrate.
            format = path.split('/')[4].split('_')[:2]
            format = "-".join( format )
            # title = u'%s-%s-%s' % (video_title, size, bitrate)

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': video_title,
                'ext': extension,
                'format': format,
                'thumbnail': thumbnail,
                'description': video_description
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self.to_screen(u'Format: %s' % req_format)

        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            format = self._specific( req_format, formats )
            if format is None:
                raise ExtractorError(u'Requested format not available')
            return [format]
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('videoid')
        # The title is taken from the URL slug, not the page.
        video_title = mobj.group('title')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video URL
        VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
        video_url = self._search_regex(VIDEO_URL_RE, webpage, u'video url')
        video_url = compat_urllib_parse.unquote(video_url)

        #Get the uploaded date
        VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
        upload_date = self._html_search_regex(VIDEO_UPLOADED_RE, webpage, u'upload date', fatal=False)
        if upload_date: upload_date = unified_strdate(upload_date)

        info = {'id': video_id,
                'url': video_url,
                'uploader': None,
                'upload_date': upload_date,
                'title': video_title,
                'ext': 'flv',
                'format': 'flv'}

        return [info]
# Extractor for youjizz.com: resolves the embed page, then reads the file URL from
# a Flash `so.addVariable` call.
# NOTE(review): stale line-number column and elided lines (numbering gaps); code
# kept byte-identical, comments only.
2175 class YouJizzIE(InfoExtractor):
2176 """Information extractor for youjizz.com."""
2177 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'
2179 def _real_extract(self, url):
2180 mobj = re.match(self._VALID_URL, url)
# Raise reached only on non-matching URL (guard line elided above).
2182 raise ExtractorError(u'Invalid URL: %s' % url)
2184 video_id = mobj.group('videoid')
2186 # Get webpage content
2187 webpage = self._download_webpage(url, video_id)
2189 # Get the video title
2190 video_title = self._html_search_regex(r'<title>(?P<title>.*)</title>',
2191 webpage, u'title').strip()
2193 # Get the embed page
2194 result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
# Raise reached only when the embed URL was not found (guard line elided above).
2196 raise ExtractorError(u'ERROR: unable to extract embed page')
2198 embed_page_url = result.group(0).strip()
# video_id is rebound to the numeric id from the embed URL.
2199 video_id = result.group('videoid')
# Second fetch: the real file URL lives on the embed page, not the landing page.
2201 webpage = self._download_webpage(embed_page_url, video_id)
2204 video_url = self._search_regex(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
2205 webpage, u'video URL')
# Result dict is partial — url/ext keys are on elided lines.
2207 info = {'id': video_id,
2209 'title': video_title,
2212 'player_url': embed_page_url}
# Extractor for 8tracks.com mixes: walks the playlist API track by track until
# `at_last_track` is set.
# NOTE(review): stale line-number column and elided lines; code kept byte-identical,
# comments only. `mix_id` is used below but never assigned in the visible lines —
# presumably set from `data` on an elided line; verify against the full source.
2216 class EightTracksIE(InfoExtractor):
2218 _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'
2220 def _real_extract(self, url):
2221 mobj = re.match(self._VALID_URL, url)
# Raise reached only on non-matching URL (guard line elided above).
2223 raise ExtractorError(u'Invalid URL: %s' % url)
2224 playlist_id = mobj.group('id')
2226 webpage = self._download_webpage(url, playlist_id)
# Mix metadata is embedded in the page as a `PAGE.mix = {...};` JS assignment.
2228 json_like = self._search_regex(r"PAGE.mix = (.*?);\n", webpage, u'trax information', flags=re.DOTALL)
2229 data = json.loads(json_like)
# Random session token sent as the `sets` id in the play/next API URLs.
2231 session = str(random.randint(0, 1000000000))
2233 track_count = data['tracks_count']
2234 first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
2235 next_url = first_url
# Unbounded counter: the loop is terminated by the `at_last_track` flag below
# (break statement elided).
2237 for i in itertools.count():
2238 api_json = self._download_webpage(next_url, playlist_id,
2239 note=u'Downloading song information %s/%s' % (str(i+1), track_count),
2240 errnote=u'Failed to download song information')
2241 api_data = json.loads(api_json)
2242 track_data = api_data[u'set']['track']
# Per-track info dict fragment — its opening and the append to the result list
# are on elided lines.
2244 'id': track_data['id'],
2245 'url': track_data['track_file_stream_url'],
2246 'title': track_data['performer'] + u' - ' + track_data['name'],
2247 'raw_title': track_data['name'],
2248 'uploader_id': data['user']['login'],
2252 if api_data['set']['at_last_track']:
# Advance to the next track using the previous track's id.
2254 next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
# Extractor for keek.com: media and thumbnail URLs are derived directly from the
# video id on cdn.keek.com; title/uploader are scraped from the page.
# NOTE(review): stale line-number column and elided lines; code kept byte-identical,
# comments only.
2257 class KeekIE(InfoExtractor):
2258 _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
2261 def _real_extract(self, url):
2262 m = re.match(self._VALID_URL, url)
2263 video_id = m.group('videoID')
# CDN URLs are constructed from the id alone — no page parsing needed for them.
2265 video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
2266 thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
2267 webpage = self._download_webpage(url, video_id)
# Continuation line(s) of this call are elided (numbering gap after 2269).
2269 video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
2272 uploader = self._html_search_regex(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
2273 webpage, u'uploader', fatal=False)
# Result dict fragment — its opening `return [{` and url/ext keys are elided.
2279 'title': video_title,
2280 'thumbnail': thumbnail,
2281 'uploader': uploader
# Extractor for ted.com: handles both single talks and playlists. The verbose
# _VALID_URL distinguishes them via the `type_playlist` / `type_talk` groups.
# NOTE(review): stale line-number column and elided lines; code kept byte-identical,
# comments only.
2285 class TEDIE(InfoExtractor):
2286 _VALID_URL=r'''http://www\.ted\.com/
2288 ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
2290 ((?P<type_talk>talks)) # We have a simple talk
2292 (/lang/(.*?))? # The url may contain the language
2293 /(?P<name>\w+) # Here goes the name and then ".html"
# Overrides the default suitable(): the pattern above needs re.VERBOSE.
2297 def suitable(cls, url):
2298 """Receives a URL and returns True if suitable for this IE."""
2299 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
2301 def _real_extract(self, url):
2302 m=re.match(self._VALID_URL, url, re.VERBOSE)
# Dispatch: single talk vs. playlist, decided by which alternative matched.
2303 if m.group('type_talk'):
2304 return [self._talk_info(url)]
2306 playlist_id=m.group('playlist_id')
2307 name=m.group('name')
2308 self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
2309 return [self._playlist_videos_info(url,name,playlist_id)]
2311 def _playlist_videos_info(self,url,name,playlist_id=0):
2312 '''Returns the videos of the playlist'''
# Multi-line verbose regex for one playlist entry (its opening line is elided).
2314 <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
2315 ([.\s]*?)data-playlist_item_id="(\d+)"
2316 ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
2318 video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
2319 webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
2320 m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
2321 m_names=re.finditer(video_name_RE,webpage)
2323 playlist_title = self._html_search_regex(r'div class="headline">\s*?<h1>\s*?<span>(.*?)</span>',
2324 webpage, 'playlist title')
# Each entry is delegated back to this extractor ('TED') as a url_result.
2326 playlist_entries = []
2327 for m_video, m_name in zip(m_videos,m_names):
2328 video_id=m_video.group('video_id')
2329 talk_url='http://www.ted.com%s' % m_name.group('talk_url')
2330 playlist_entries.append(self.url_result(talk_url, 'TED'))
2331 return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)
2333 def _talk_info(self, url, video_id=0):
2334 """Return the video for the talk in the url"""
2335 m = re.match(self._VALID_URL, url,re.VERBOSE)
2336 video_name = m.group('name')
2337 webpage = self._download_webpage(url, video_id, 'Downloading \"%s\" page' % video_name)
2338 self.report_extraction(video_name)
2339 # If the url includes the language we get the title translated
# Continuation line of this call is elided (gap after 2340).
2340 title = self._html_search_regex(r'<span id="altHeadline" >(?P<title>.*)</span>',
# Talk metadata is embedded as a `var talkDetails = {...}` script blob.
2342 json_data = self._search_regex(r'<script.*?>var talkDetails = ({.*?})</script>',
2343 webpage, 'json data')
2344 info = json.loads(json_data)
2345 desc = self._html_search_regex(r'<div class="talk-intro">.*?<p.*?>(.*?)</p>',
2346 webpage, 'description', flags = re.DOTALL)
2348 thumbnail = self._search_regex(r'</span>[\s.]*</div>[\s.]*<img src="(.*?)"',
2349 webpage, 'thumbnail')
# Result dict fragment — opening and id/title keys are elided. [-1] picks the
# last stream variant listed in htmlStreams.
2352 'url': info['htmlStreams'][-1]['file'],
2355 'thumbnail': thumbnail,
2356 'description': desc,
# Extractor for myspass.de: all metadata comes from a server-side XML endpoint
# keyed by the video id taken from the URL path.
# NOTE(review): stale line-number column and elided lines; code kept byte-identical,
# comments only.
2360 class MySpassIE(InfoExtractor):
2361 _VALID_URL = r'http://www.myspass.de/.*'
2363 def _real_extract(self, url):
2364 META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'
2366 # video id is the last path element of the URL
2367 # usually there is a trailing slash, so also try the second but last
2368 url_path = compat_urllib_parse_urlparse(url).path
2369 url_parent_path, video_id = os.path.split(url_path)
# Fallback for trailing-slash URLs (its guarding condition is elided).
2371 _, video_id = os.path.split(url_parent_path)
2374 metadata_url = META_DATA_URL_TEMPLATE % video_id
2375 metadata_text = self._download_webpage(metadata_url, video_id)
# ElementTree needs bytes here; the downloaded page is a text (unicode) string.
2376 metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))
2378 # extract values from metadata
# url_flv and title are mandatory; format/description/thumbnail are optional.
2379 url_flv_el = metadata.find('url_flv')
2380 if url_flv_el is None:
2381 raise ExtractorError(u'Unable to extract download url')
2382 video_url = url_flv_el.text
2383 extension = os.path.splitext(video_url)[1][1:]
2384 title_el = metadata.find('title')
2385 if title_el is None:
2386 raise ExtractorError(u'Unable to extract title')
2387 title = title_el.text
2388 format_id_el = metadata.find('format_id')
# The None-branch body for format_id is elided (gap 2390-2391).
2389 if format_id_el is None:
2392 format = format_id_el.text
2393 description_el = metadata.find('description')
2394 if description_el is not None:
2395 description = description_el.text
2398 imagePreview_el = metadata.find('imagePreview')
2399 if imagePreview_el is not None:
2400 thumbnail = imagePreview_el.text
# Result dict fragment — opening and id/url/title keys are elided.
2409 'thumbnail': thumbnail,
2410 'description': description
# Extractor for spiegel.de videos: the stream list lives in a per-video XML file
# on video2.spiegel.de; the last <type> entry is used.
# NOTE(review): stale line-number column and elided lines; code kept byte-identical,
# comments only.
2414 class SpiegelIE(InfoExtractor):
2415 _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'
2417 def _real_extract(self, url):
2418 m = re.match(self._VALID_URL, url)
2419 video_id = m.group('videoID')
2421 webpage = self._download_webpage(url, video_id)
# Continuation line of this call is elided (gap after 2423).
2423 video_title = self._html_search_regex(r'<div class="module-title">(.*?)</div>',
2426 xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
2427 xml_code = self._download_webpage(xml_url, video_id,
2428 note=u'Downloading XML', errnote=u'Failed to download XML')
2430 idoc = xml.etree.ElementTree.fromstring(xml_code)
# idoc[-1]: pick the last format/type element in the XML document.
2431 last_type = idoc[-1]
2432 filename = last_type.findall('./filename')[0].text
2433 duration = float(last_type.findall('./duration')[0].text)
2435 video_url = 'http://video2.spiegel.de/flash/' + filename
2436 video_ext = filename.rpartition('.')[2]
# Result dict fragment — opening and id/url/ext keys are elided.
2441 'title': video_title,
2442 'duration': duration,
# Extractor for liveleak.com: file URL from a JS `file: "..."` assignment,
# title/description from Open Graph meta tags.
# NOTE(review): stale line-number column and elided lines; code kept byte-identical,
# comments only.
2446 class LiveLeakIE(InfoExtractor):
2448 _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
2449 IE_NAME = u'liveleak'
2451 def _real_extract(self, url):
2452 mobj = re.match(self._VALID_URL, url)
# Raise reached only on non-matching URL (guard line elided above).
2454 raise ExtractorError(u'Invalid URL: %s' % url)
2456 video_id = mobj.group('video_id')
2458 webpage = self._download_webpage(url, video_id)
2460 video_url = self._search_regex(r'file: "(.*?)",',
2461 webpage, u'video URL')
# The og:title carries a "LiveLeak.com -" prefix that is stripped off.
2463 video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
2464 webpage, u'title').replace('LiveLeak.com -', '').strip()
2466 video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
2467 webpage, u'description', fatal=False)
2469 video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
2470 webpage, u'uploader', fatal=False)
# Result dict fragment — opening and id/url/ext keys are elided.
2476 'title': video_title,
2477 'description': video_description,
2478 'uploader': video_uploader
# Extractor for tumblr.com video posts: the media URL is embedded hex-escaped
# (\x22 quotes) in inline JS on the canonical post page.
# NOTE(review): stale line-number column and elided lines; code kept byte-identical,
# comments only.
2485 class TumblrIE(InfoExtractor):
2486 _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'
2488 def _real_extract(self, url):
2489 m_url = re.match(self._VALID_URL, url)
2490 video_id = m_url.group('id')
2491 blog = m_url.group('blog_name')
# Rebuild the canonical /post/ URL regardless of which form was given.
2493 url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
2494 webpage = self._download_webpage(url, video_id)
# \\x22 in the pattern matches the literal backslash-escaped quote in the page JS.
2496 re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
2497 video = re.search(re_video, webpage)
# Raise reached only when no embedded video matched (guard line elided above).
2499 raise ExtractorError(u'Unable to extract video')
2500 video_url = video.group('video_url')
2501 ext = video.group('ext')
2503 video_thumbnail = self._search_regex(r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
2504 webpage, u'thumbnail', fatal=False) # We pick the first poster
# Strip the JS backslash escapes from the captured thumbnail URL.
2505 if video_thumbnail: video_thumbnail = video_thumbnail.replace('\\', '')
2507 # The only place where you can get a title, it's not complete,
2508 # but searching in other places doesn't work for all videos
2509 video_title = self._html_search_regex(r'<title>(?P<title>.*?)</title>',
2510 webpage, u'title', flags=re.DOTALL)
# Result dict — url/ext lines are elided between id and title.
2512 return [{'id': video_id,
2514 'title': video_title,
2515 'thumbnail': video_thumbnail,
# Extractor for free Bandcamp tracks: follows the free-download page, then
# rebuilds the statdownload URL to obtain a working (non-expired) track URL.
# NOTE(review): stale line-number column and elided lines; code kept byte-identical,
# comments only. `id` shadows the builtin — preserved as-is.
2519 class BandcampIE(InfoExtractor):
2520 _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'
2522 def _real_extract(self, url):
2523 mobj = re.match(self._VALID_URL, url)
2524 title = mobj.group('title')
2525 webpage = self._download_webpage(url, title)
2526 # We get the link to the free download page
2527 m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
2528 if m_download is None:
# Only tracks offered as free downloads are supported.
2529 raise ExtractorError(u'No free songs found')
2531 download_link = m_download.group(1)
# Track id scraped from the inline `TralbumData` JS object.
2532 id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
2533 webpage, re.MULTILINE|re.DOTALL).group('id')
2535 download_webpage = self._download_webpage(download_link, id,
2536 'Downloading free downloads page')
2537 # We get the dictionary of the track from some javascrip code
2538 info = re.search(r'items: (.*?),$',
2539 download_webpage, re.MULTILINE).group(1)
2540 info = json.loads(info)[0]
2541 # We pick mp3-320 for now, until format selection can be easily implemented.
2542 mp3_info = info[u'downloads'][u'mp3-320']
2543 # If we try to use this url it says the link has expired
2544 initial_url = mp3_info[u'url']
# Tear the expired URL apart to recover server/fsig/ts for the rebuilt request.
2545 re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
2546 m_url = re.match(re_url, initial_url)
2547 #We build the url we will use to get the final track url
2548 # This url is build in Bandcamp in the script download_bunde_*.js
2549 request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), id, m_url.group('ts'))
2550 final_url_webpage = self._download_webpage(request_url, id, 'Requesting download url')
2551 # If we could correctly generate the .rand field the url would be
2552 #in the "download_url" key
2553 final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)
# Result dict fragment — url/ext keys are on elided lines.
2555 track_info = {'id':id,
2556 'title' : info[u'title'],
2559 'thumbnail' : info[u'thumb_url'],
2560 'uploader' : info[u'artist']
# Extractor for redtube.com: mp4 source URL and title scraped straight from the
# watch page.
# NOTE(review): stale line-number column and elided lines; code kept byte-identical,
# comments only.
2565 class RedTubeIE(InfoExtractor):
2566 """Information Extractor for redtube"""
2567 _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'
2569 def _real_extract(self,url):
2570 mobj = re.match(self._VALID_URL, url)
# Raise reached only on non-matching URL (guard line elided above).
2572 raise ExtractorError(u'Invalid URL: %s' % url)
2574 video_id = mobj.group('id')
# Extension is fixed: the page serves a <source type="video/mp4"> element.
2575 video_extension = 'mp4'
2576 webpage = self._download_webpage(url, video_id)
2578 self.report_extraction(video_id)
2580 video_url = self._html_search_regex(r'<source src="(.+?)" type="video/mp4">',
2581 webpage, u'video URL')
# Continuation line of this call is elided (gap after 2583).
2583 video_title = self._html_search_regex('<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
# Result dict fragment — opening and id/url keys are elided.
2589 'ext': video_extension,
2590 'title': video_title,
# Extractor for ina.fr: metadata comes from the player's MRSS feed rather than
# the HTML page itself.
# NOTE(review): stale line-number column and elided lines; code kept byte-identical,
# comments only.
2593 class InaIE(InfoExtractor):
2594 """Information Extractor for Ina.fr"""
2595 _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'
2597 def _real_extract(self,url):
2598 mobj = re.match(self._VALID_URL, url)
2600 video_id = mobj.group('id')
# The MRSS feed for the notice is fetched instead of the watch page.
2601 mrss_url='http://player.ina.fr/notices/%s.mrss' % video_id
2602 video_extension = 'mp4'
2603 webpage = self._download_webpage(mrss_url, video_id)
2605 self.report_extraction(video_id)
2607 video_url = self._html_search_regex(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
2608 webpage, u'video URL')
# Title is wrapped in CDATA inside the feed; continuation line elided after 2610.
2610 video_title = self._search_regex(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
# Result dict fragment — opening and id/url keys are elided.
2616 'ext': video_extension,
2617 'title': video_title,
# Extractor for howcast.com: mobile mp4 URL from inline JS, metadata from
# meta tags (note the site's attribute order: content before property/name).
# NOTE(review): stale line-number column and elided lines; code kept byte-identical,
# comments only.
2620 class HowcastIE(InfoExtractor):
2621 """Information Extractor for Howcast.com"""
2622 _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'
2624 def _real_extract(self, url):
2625 mobj = re.match(self._VALID_URL, url)
2627 video_id = mobj.group('id')
# Canonical watch URL is rebuilt from the id.
2628 webpage_url = 'http://www.howcast.com/videos/' + video_id
2629 webpage = self._download_webpage(webpage_url, video_id)
2631 self.report_extraction(video_id)
2633 video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
2634 webpage, u'video URL')
# Continuation line of this call is elided (gap after 2636).
2636 video_title = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
2639 video_description = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
2640 webpage, u'description', fatal=False)
2642 thumbnail = self._html_search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
2643 webpage, u'thumbnail', fatal=False)
# Result dict fragment — opening and id/url/ext keys are elided.
2649 'title': video_title,
2650 'description': video_description,
2651 'thumbnail': thumbnail,
# Extractor for vine.co: stream URL from the twitter:player:stream meta tag,
# remaining metadata from OG tags and the user block.
# NOTE(review): stale line-number column and elided lines; code kept byte-identical,
# comments only.
2654 class VineIE(InfoExtractor):
2655 """Information Extractor for Vine.co"""
2656 _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'
2658 def _real_extract(self, url):
2659 mobj = re.match(self._VALID_URL, url)
2661 video_id = mobj.group('id')
# Canonical https URL rebuilt from the id.
2662 webpage_url = 'https://vine.co/v/' + video_id
2663 webpage = self._download_webpage(webpage_url, video_id)
2665 self.report_extraction(video_id)
2667 video_url = self._html_search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
2668 webpage, u'video URL')
# Continuation line of this call is elided (gap after 2670).
2670 video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
# Query string, if any, is excluded from the captured thumbnail URL.
2673 thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"',
2674 webpage, u'thumbnail', fatal=False)
2676 uploader = self._html_search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
2677 webpage, u'uploader', fatal=False, flags=re.DOTALL)
# Result dict fragment — opening and id/url/ext keys are elided.
2683 'title': video_title,
2684 'thumbnail': thumbnail,
2685 'uploader': uploader,
# Extractor for Flickr videos: two-step XML API dance — first fetch the node id
# using the page's photo_secret, then fetch the playlist XML carrying the
# STREAM APP/FULLPATH pair that forms the final URL.
# NOTE(review): stale line-number column and elided lines; code kept byte-identical,
# comments only.
2688 class FlickrIE(InfoExtractor):
2689 """Information Extractor for Flickr videos"""
2690 _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'
2692 def _real_extract(self, url):
2693 mobj = re.match(self._VALID_URL, url)
2695 video_id = mobj.group('id')
2696 video_uploader_id = mobj.group('uploader_id')
2697 webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
2698 webpage = self._download_webpage(webpage_url, video_id)
# The per-photo secret is required by both XML endpoints below.
2700 secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')
2702 first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
2703 first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')
2705 node_id = self._html_search_regex(r'<Item id="id">(\d+-\d+)</Item>',
2706 first_xml, u'node_id')
2708 second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
2709 second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')
2711 self.report_extraction(video_id)
2713 mobj = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
# Raise reached only when the STREAM element was missing (guard line elided).
2715 raise ExtractorError(u'Unable to extract video url')
# Final URL = APP prefix + HTML-unescaped FULLPATH.
2716 video_url = mobj.group(1) + unescapeHTML(mobj.group(2))
2718 video_title = self._html_search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
2719 webpage, u'video title')
2721 video_description = self._html_search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
2722 webpage, u'description', fatal=False)
2724 thumbnail = self._html_search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
2725 webpage, u'thumbnail', fatal=False)
# Result dict fragment — opening and id/url/ext keys are elided.
2731 'title': video_title,
2732 'description': video_description,
2733 'thumbnail': thumbnail,
2734 'uploader_id': video_uploader_id,
# Extractor for teamcoco.com: the numeric id is scraped from the article markup,
# then the cvp XML endpoint supplies the high-quality file URL.
# NOTE(review): stale line-number column and elided lines; code kept byte-identical,
# comments only.
2737 class TeamcocoIE(InfoExtractor):
2738 _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'
2740 def _real_extract(self, url):
2741 mobj = re.match(self._VALID_URL, url)
# Raise reached only on non-matching URL (guard line elided above).
2743 raise ExtractorError(u'Invalid URL: %s' % url)
2744 url_title = mobj.group('url_title')
2745 webpage = self._download_webpage(url, url_title)
2747 video_id = self._html_search_regex(r'<article class="video" data-id="(\d+?)"',
2748 webpage, u'video id')
2750 self.report_extraction(video_id)
# Continuation line of this call is elided (gap after 2752).
2752 video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
2755 thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)"',
2756 webpage, u'thumbnail', fatal=False)
2758 video_description = self._html_search_regex(r'<meta property="og:description" content="(.*?)"',
2759 webpage, u'description', fatal=False)
# Second fetch: per-video XML document holding the actual file URLs.
2761 data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
2762 data = self._download_webpage(data_url, video_id, 'Downloading data webpage')
# Continuation line(s) of this call are elided (gap after 2764).
2764 video_url = self._html_search_regex(r'<file type="high".*?>(.*?)</file>',
# Result dict fragment — opening and id/url/ext keys are elided.
2771 'title': video_title,
2772 'thumbnail': thumbnail,
2773 'description': video_description,
# Extractor for xhamster.com: server/file pair from inline player JS; the file
# URL is either direct (empty server) or server + '/key=' + file.
# NOTE(review): stale line-number column and elided lines; code kept byte-identical,
# comments only.
2776 class XHamsterIE(InfoExtractor):
2777 """Information Extractor for xHamster"""
2778 _VALID_URL = r'(?:http://)?(?:www.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'
2780 def _real_extract(self,url):
2781 mobj = re.match(self._VALID_URL, url)
2783 video_id = mobj.group('id')
# Canonical movie URL rebuilt from the id before fetching.
2784 mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
2785 webpage = self._download_webpage(mrss_url, video_id)
2787 mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
# Raise reached only when srv/file pair was not found (guard line elided above).
2789 raise ExtractorError(u'Unable to extract media URL')
2790 if len(mobj.group('server')) == 0:
# Empty server: the file value is already a full (percent-encoded) URL.
2791 video_url = compat_urllib_parse.unquote(mobj.group('file'))
# else-branch (server present): join server and file with '/key=' (the `else:`
# line itself is elided).
2793 video_url = mobj.group('server')+'/key='+mobj.group('file')
2794 video_extension = video_url.split('.')[-1]
# Continuation line of this call is elided (gap after 2796).
2796 video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
2799 # Can't see the description anywhere in the UI
2800 # video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
2801 # webpage, u'description', fatal=False)
2802 # if video_description: video_description = unescapeHTML(video_description)
# Upload date parsed from a tooltip hint attribute; optional.
2804 mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
2806 video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
2808 video_upload_date = None
2809 self._downloader.report_warning(u'Unable to extract upload date')
2811 video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
2812 webpage, u'uploader id', default=u'anonymous')
2814 video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
2815 webpage, u'thumbnail', fatal=False)
# Result dict fragment — opening and id/url keys are elided.
2820 'ext': video_extension,
2821 'title': video_title,
2822 # 'description': video_description,
2823 'upload_date': video_upload_date,
2824 'uploader_id': video_uploader_id,
2825 'thumbnail': video_thumbnail
# Extractor for hypem.com tracks: scrapes the displayList JSON, then calls the
# serve/source API, forwarding the page's Set-Cookie header.
# NOTE(review): stale line-number column and elided lines; code kept byte-identical,
# comments only. `key` is used below but its assignment (presumably from `track`)
# is on an elided line — verify against the full source.
2828 class HypemIE(InfoExtractor):
2829 """Information Extractor for hypem"""
2830 _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'
2832 def _real_extract(self, url):
2833 mobj = re.match(self._VALID_URL, url)
# Raise reached only on non-matching URL (guard line elided above).
2835 raise ExtractorError(u'Invalid URL: %s' % url)
2836 track_id = mobj.group(1)
# ax/ts query parameters appended to the request, as the site's JS does.
2838 data = { 'ax': 1, 'ts': time.time() }
2839 data_encoded = compat_urllib_parse.urlencode(data)
2840 complete_url = url + "?" + data_encoded
2841 request = compat_urllib_request.Request(complete_url)
# The response handle is kept so the session cookie can be replayed below.
2842 response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
2843 cookie = urlh.headers.get('Set-Cookie', '')
2845 self.report_extraction(track_id)
2847 html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
2848 response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
# This json.loads is inside a try whose opening is elided; the except body
# is the raise two lines down.
2850 track_list = json.loads(html_tracks)
2851 track = track_list[u'tracks'][0]
2853 raise ExtractorError(u'Hypemachine contained invalid JSON.')
2856 track_id = track[u"id"]
2857 artist = track[u"artist"]
2858 title = track[u"song"]
2860 serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
# Empty POST body ("") makes this a POST request; cookie replays the session.
2861 request = compat_urllib_request.Request(serve_url, "" , {'Content-Type': 'application/json'})
2862 request.add_header('cookie', cookie)
2863 song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
# Same elided try/except pattern around this json.loads.
2865 song_data = json.loads(song_data_json)
2867 raise ExtractorError(u'Hypemachine contained invalid JSON.')
2868 final_url = song_data[u"url"]
# Extractor for vbox7.com: follows the JS window.location redirect, then POSTs
# to the magare.do endpoint to get the final media and thumbnail URLs.
# NOTE(review): stale line-number column and elided lines; code kept byte-identical,
# comments only.
2878 class Vbox7IE(InfoExtractor):
2879 """Information Extractor for Vbox7"""
2880 _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'
2882 def _real_extract(self,url):
2883 mobj = re.match(self._VALID_URL, url)
# Raise reached only on non-matching URL (guard line elided above).
2885 raise ExtractorError(u'Invalid URL: %s' % url)
2886 video_id = mobj.group(1)
# The play page redirects via JS, not HTTP — the target is scraped from the body
# and resolved against the URL we actually landed on.
2888 redirect_page, urlh = self._download_webpage_handle(url, video_id)
2889 new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
2890 redirect_url = urlh.geturl() + new_location
2891 webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')
# Title is the part of <title> before the first '/'.
2893 title = self._html_search_regex(r'<title>(.*)</title>',
2894 webpage, u'title').split('/')[0].strip()
2897 info_url = "http://vbox7.com/play/magare.do"
# Form-encoded POST body; the endpoint answers with 'k=v&k=v' pairs.
2898 data = compat_urllib_parse.urlencode({'as3':'1','vid':video_id})
2899 info_request = compat_urllib_request.Request(info_url, data)
2900 info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
2901 info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
2902 if info_response is None:
2903 raise ExtractorError(u'Unable to extract the media url')
# Split the two '&'-separated pairs and keep only the values after '='.
2904 (final_url, thumbnail_url) = map(lambda x: x.split('=')[1], info_response.split('&'))
# Result dict fragment — opening and id/url/ext/title keys are elided.
2911 'thumbnail': thumbnail_url,
# Factory returning one instance of every supported extractor, in matching
# priority order (first match wins).
# NOTE(review): the body of the returned list is almost entirely elided here
# (numbering gaps); only a few entries are visible. Code kept byte-identical,
# comments only.
2915 def gen_extractors():
2916 """ Return a list of an instance of every supported extractor.
2917 The order does matter; the first extractor matched is the one handling the URL.
2920 YoutubePlaylistIE(),
2945 StanfordOpenClassroomIE(),
2955 WorldStarHipHopIE(),
def get_info_extractor(ie_name):
    """Return the info extractor class registered under *ie_name*.

    Extractor classes in this module follow the ``<Name>IE`` naming
    convention, so the lookup simply appends the suffix and resolves the
    name in this module's globals.

    Raises KeyError if no class named ``<ie_name>IE`` exists here.
    Note: the original excerpt carried a stale line-number column
    ("2985 " etc.) that made the block syntactically invalid; it has been
    stripped and the indentation restored.
    """
    return globals()[ie_name + 'IE']