2 # -*- coding: utf-8 -*-
4 from __future__ import absolute_import
15 import xml.etree.ElementTree
24 from .extractor.common import InfoExtractor, SearchInfoExtractor
26 from .extractor.ard import ARDIE
27 from .extractor.arte import ArteTvIE
28 from .extractor.dailymotion import DailymotionIE
29 from .extractor.metacafe import MetacafeIE
30 from .extractor.statigram import StatigramIE
31 from .extractor.photobucket import PhotobucketIE
32 from .extractor.vimeo import VimeoIE
33 from .extractor.yahoo import YahooIE
34 from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, YoutubeUserIE, YoutubeChannelIE
35 from .extractor.zdf import ZDFIE
# Generic last-resort extractor: resolves URL-shortener redirects with a HEAD
# request, then scrapes the page for common embedded-player patterns
# (JW Player flashvars, file=/source= params, Twitter player cards, Open Graph).
# NOTE(review): this is an elided view — the fused original line numbers jump
# (e.g. 47 -> 53), so statements are missing; code left byte-identical.
47 class GenericIE(InfoExtractor):
48 """Generic last-resort information extractor."""
53 def report_download_webpage(self, video_id):
54 """Report webpage download."""
# Warn only in real runs; the 'test' downloader param suppresses the warning.
55 if not self._downloader.params.get('test', False):
56 self._downloader.report_warning(u'Falling back on generic information extractor.')
57 super(GenericIE, self).report_download_webpage(video_id)
59 def report_following_redirect(self, new_url):
60 """Report information extraction."""
61 self._downloader.to_screen(u'[redirect] Following redirect to %s' % new_url)
# Probe the URL with HEAD (falling back to GET on 405) so shortener targets
# can be handed to a more specific extractor.
63 def _test_redirect(self, url):
64 """Check if it is a redirect, like url shorteners, in case return the new url."""
# Request subclass forcing the HEAD method — body elided in this view.
65 class HeadRequest(compat_urllib_request.Request):
69 class HEADRedirectHandler(compat_urllib_request.HTTPRedirectHandler):
71 Subclass the HTTPRedirectHandler to make it use our
72 HeadRequest also on the redirected URL
74 def redirect_request(self, req, fp, code, msg, headers, newurl):
75 if code in (301, 302, 303, 307):
76 newurl = newurl.replace(' ', '%20')
# Drop body-describing headers: a redirected HEAD request carries no payload.
77 newheaders = dict((k,v) for k,v in req.headers.items()
78 if k.lower() not in ("content-length", "content-type"))
79 return HeadRequest(newurl,
81 origin_req_host=req.get_origin_req_host(),
# Non-redirect codes are re-raised as HTTPError (else branch — guard elided).
84 raise compat_urllib_error.HTTPError(req.get_full_url(), code, msg, headers, fp)
86 class HTTPMethodFallback(compat_urllib_request.BaseHandler):
88 Fallback to GET if HEAD is not allowed (405 HTTP error)
90 def http_error_405(self, req, fp, code, msg, headers):
94 newheaders = dict((k,v) for k,v in req.headers.items()
95 if k.lower() not in ("content-length", "content-type"))
96 return self.parent.open(compat_urllib_request.Request(req.get_full_url(),
98 origin_req_host=req.get_origin_req_host(),
# Bare OpenerDirector so ONLY the handlers listed below participate.
102 opener = compat_urllib_request.OpenerDirector()
103 for handler in [compat_urllib_request.HTTPHandler, compat_urllib_request.HTTPDefaultErrorHandler,
104 HTTPMethodFallback, HEADRedirectHandler,
105 compat_urllib_request.HTTPErrorProcessor, compat_urllib_request.HTTPSHandler]:
106 opener.add_handler(handler())
108 response = opener.open(HeadRequest(url))
# Raised when the scheme is not HTTP(S) — surrounding guard elided.
110 raise ExtractorError(u'Invalid URL protocol')
111 new_url = response.geturl()
# presumably returns new_url only when it differs from the input — the
# comparison/return lines are elided here; TODO confirm against full source.
116 self.report_following_redirect(new_url)
119 def _real_extract(self, url):
# Redirect resolution first; the resolved URL is re-routed to all extractors.
120 new_url = self._test_redirect(url)
121 if new_url: return [self.url_result(new_url)]
123 video_id = url.split('/')[-1]
125 webpage = self._download_webpage(url, video_id)
126 except ValueError as err:
127 # since this is the last-resort InfoExtractor, if
128 # this error is thrown, it'll be thrown here
129 raise ExtractorError(u'Invalid URL: %s' % url)
131 self.report_extraction(video_id)
132 # Start with something easy: JW Player in SWFObject
133 mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
135 # Broaden the search a little bit
136 mobj = re.search(r'[^A-Za-z0-9]?(?:file|source)=(http[^\'"&]*)', webpage)
138 # Broaden the search a little bit: JWPlayer JS loader
139 mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
141 # Try to find twitter cards info
142 mobj = re.search(r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage)
144 # We look for Open Graph info:
145 # We have to match any number spaces between elements, some sites try to align them (eg.: statigr.am)
146 m_video_type = re.search(r'<meta.*?property="og:video:type".*?content="video/(.*?)"', webpage)
147 # We only look in og:video if the MIME type is a video, don't try if it's a Flash player:
148 if m_video_type is not None:
149 mobj = re.search(r'<meta.*?property="og:video".*?content="(.*?)"', webpage)
# Raised when every pattern above failed (mobj is None guard elided).
151 raise ExtractorError(u'Invalid URL: %s' % url)
153 # It's possible that one of the regexes
154 # matched, but returned an empty group:
155 if mobj.group(1) is None:
156 raise ExtractorError(u'Invalid URL: %s' % url)
158 video_url = compat_urllib_parse.unquote(mobj.group(1))
159 video_id = os.path.basename(video_url)
161 # here's a fun little line of code for you:
# Split "name.ext" into extension (sans dot) and bare id.
162 video_extension = os.path.splitext(video_id)[1][1:]
163 video_id = os.path.splitext(video_id)[0]
165 # it's tempting to parse this further, but you would
166 # have to take into account all the variations like
167 # Video Title - Site Name
168 # Site Name | Video Title
169 # Video Title - Tagline | Site Name
170 # and so on and so forth; it's just not practical
171 video_title = self._html_search_regex(r'<title>(.*)</title>',
172 webpage, u'video title')
174 # video uploader is domain name
175 video_uploader = self._search_regex(r'(?:https?://)?([^/]*)/.*',
176 url, u'video uploader')
# Returned info dict — surrounding literal braces and 'url'/'id' keys elided.
181 'uploader': video_uploader,
183 'title': video_title,
184 'ext': video_extension,
# Search extractor for Google Video ("gvsearch" prefix): pages through the
# HTML results, collecting result links until n entries are found or no
# "next" paginator is present.
# NOTE(review): elided view — fused original line numbers jump, so some
# statements (e.g. the 'res' initialization, url_result call) are missing.
189 class GoogleSearchIE(SearchInfoExtractor):
190 """Information Extractor for Google Video search queries."""
# Presence of this paginator marker means more result pages exist.
191 _MORE_PAGES_INDICATOR = r'id="pnnext" class="pn"'
193 IE_NAME = u'video.google:search'
194 _SEARCH_KEY = 'gvsearch'
196 def _get_n_results(self, query, n):
197 """Get a specified number of results for a query"""
# 1-based page counter; each page holds 10 results (start=pagenum*10).
205 for pagenum in itertools.count(1):
206 result_url = u'http://www.google.com/search?tbm=vid&q=%s&start=%s&hl=en' % (compat_urllib_parse.quote_plus(query), pagenum*10)
207 webpage = self._download_webpage(result_url, u'gvsearch:' + query,
208 note='Downloading result page ' + str(pagenum))
210 for mobj in re.finditer(r'<h3 class="r"><a href="([^"]+)"', webpage):
# 'res' and 'e' are built on elided lines — presumably a playlist_result
# dict and a url_result entry; TODO confirm against full source.
215 res['entries'].append(e)
# Stop when enough results were gathered or the paginator disappears.
217 if (pagenum * 10 > n) or not re.search(self._MORE_PAGES_INDICATOR, webpage):
# Search extractor for Yahoo! Screen ("yvsearch" prefix): the endpoint
# returns JSON; results are scraped out of each JSON result snippet.
# NOTE(review): elided view — fused original line numbers jump; 'res' and 'm'
# are defined on lines not visible here.
220 class YahooSearchIE(SearchInfoExtractor):
221 """Information Extractor for Yahoo! Video search queries."""
224 IE_NAME = u'screen.yahoo:search'
225 _SEARCH_KEY = 'yvsearch'
227 def _get_n_results(self, query, n):
228 """Get a specified number of results for a query"""
# 0-based page counter; 30 results per page (b= offset parameter).
235 for pagenum in itertools.count(0):
236 result_url = u'http://video.search.yahoo.com/search/?p=%s&fr=screen&o=js&gs=0&b=%d' % (compat_urllib_parse.quote_plus(query), pagenum * 30)
237 webpage = self._download_webpage(result_url, query,
238 note='Downloading results page '+str(pagenum+1))
# The endpoint answers with JSON despite the HTML-looking URL.
239 info = json.loads(webpage)
241 results = info[u'results']
243 for (i, r) in enumerate(results):
# Stop collecting once n results across all pages have been taken.
244 if (pagenum * 30) +i >= n:
246 mobj = re.search(r'(?P<url>screen\.yahoo\.com/.*?-\d*?\.html)"', r)
247 e = self.url_result('http://' + mobj.group('url'), 'Yahoo')
248 res['entries'].append(e)
# 'm' is presumably pagination metadata (info[u'm'] or similar) defined on an
# elided line — TODO confirm; terminates on count reached or last page.
249 if (pagenum * 30 +i >= n) or (m[u'last'] >= (m[u'total'] -1 )):
# Playlist extractor for a blip.tv user page: resolves the numeric users_id
# from the mobile page, then pages through the AJAX episode list collecting
# video ids, and returns them as a playlist of BlipTV url_results.
# NOTE(review): elided view — fused original line numbers jump (e.g. the
# 'if mobj is None' guards and the pagination loop header are missing).
255 class BlipTVUserIE(InfoExtractor):
256 """Information Extractor for blip.tv users."""
258 _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?blip\.tv/)|bliptvuser:)([^/]+)/*$'
260 IE_NAME = u'blip.tv:user'
262 def _real_extract(self, url):
264 mobj = re.match(self._VALID_URL, url)
# Raised when the URL does not match (guard line elided).
266 raise ExtractorError(u'Invalid URL: %s' % url)
268 username = mobj.group(1)
270 page_base = 'http://m.blip.tv/pr/show_get_full_episode_list?users_id=%s&lite=0&esi=1'
272 page = self._download_webpage(url, username, u'Downloading user page')
# The numeric account id is embedded in the page markup.
273 mobj = re.search(r'data-users-id="([^"]+)"', page)
274 page_base = page_base % mobj.group(1)
277 # Download video ids using BlipTV Ajax calls. Result size per
278 # query is limited (currently to 12 videos) so we need to query
279 # page by page until there are no video ids - it means we got
# 'pagenum', 'video_ids' and the loop header are defined on elided lines.
286 url = page_base + "&page=" + str(pagenum)
287 page = self._download_webpage(url, username,
288 u'Downloading video ids from page %d' % pagenum)
290 # Extract video identifiers
293 for mobj in re.finditer(r'href="/([^"]+)"', page):
# De-duplicate within the page before appending.
294 if mobj.group(1) not in ids_in_page:
295 ids_in_page.append(unescapeHTML(mobj.group(1)))
297 video_ids.extend(ids_in_page)
299 # A little optimization - if current page is not
300 # "full", ie. does not contain PAGE_SIZE video ids then
301 # we can assume that this page is the last one - there
302 # are no more ids on further pages - no need to query
305 if len(ids_in_page) < self._PAGE_SIZE:
# Wrap every collected id as a BlipTV url_result inside one playlist.
310 urls = [u'http://blip.tv/%s' % video_id for video_id in video_ids]
311 url_entries = [self.url_result(url, 'BlipTV') for url in urls]
312 return [self.playlist_result(url_entries, playlist_title = username)]
# Extractor for depositfiles.com: POSTs the "Free download" form, then pulls
# the real fileshare URL out of the response, surfacing the site's
# restriction message when the download is blocked.
# NOTE(review): elided view — fused original line numbers jump; the 'try:'
# headers and the return-dict braces are on lines not visible here.
315 class DepositFilesIE(InfoExtractor):
316 """Information extractor for depositfiles.com"""
318 _VALID_URL = r'(?:http://)?(?:\w+\.)?depositfiles\.com/(?:../(?#locale))?files/(.+)'
320 def _real_extract(self, url):
321 file_id = url.split('/')[-1]
322 # Rebuild url in english locale
323 url = 'http://depositfiles.com/en/files/' + file_id
325 # Retrieve file webpage with 'Free download' button pressed
# POST body simulating the pressed button.
326 free_download_indication = { 'gateway_result' : '1' }
327 request = compat_urllib_request.Request(url, compat_urllib_parse.urlencode(free_download_indication))
329 self.report_download_webpage(file_id)
330 webpage = compat_urllib_request.urlopen(request).read()
331 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
332 raise ExtractorError(u'Unable to retrieve file webpage: %s' % compat_str(err))
334 # Search for the real file URL
335 mobj = re.search(r'<form action="(http://fileshare.+?)"', webpage)
336 if (mobj is None) or (mobj.group(1) is None):
337 # Try to figure out reason of the error.
338 mobj = re.search(r'<strong>(Attention.*?)</strong>', webpage, re.DOTALL)
339 if (mobj is not None) and (mobj.group(1) is not None):
# Collapse whitespace in the site's own restriction text for display.
340 restriction_message = re.sub('\s+', ' ', mobj.group(1)).strip()
341 raise ExtractorError(u'%s' % restriction_message)
343 raise ExtractorError(u'Unable to extract download URL from: %s' % url)
345 file_url = mobj.group(1)
346 file_extension = os.path.splitext(file_url)[1][1:]
348 # Search for file title
349 file_title = self._search_regex(r'<b title="(.*?)">', webpage, u'title')
# NOTE(review): .decode('utf-8') on these values implies they are bytes here
# (Python 2 era); under Python 3 str has no .decode — flagging, left as-is.
352 'id': file_id.decode('utf-8'),
353 'url': file_url.decode('utf-8'),
357 'ext': file_extension.decode('utf-8'),
# Extractor for Facebook videos: optionally logs in (credentials from
# downloader params or .netrc), then parses the SWF variable blob embedded in
# the video page to obtain hd_src/sd_src stream URLs and metadata.
# NOTE(review): elided view — fused original line numbers jump; 'try:'
# headers, 'login_form' construction and several guards are not visible.
361 class FacebookIE(InfoExtractor):
362 """Information Extractor for Facebook"""
364 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?facebook\.com/(?:video/video|photo)\.php\?(?:.*?)v=(?P<ID>\d+)(?:.*)'
365 _LOGIN_URL = 'https://login.facebook.com/login.php?m&next=http%3A%2F%2Fm.facebook.com%2Fhome.php&'
366 _NETRC_MACHINE = 'facebook'
367 IE_NAME = u'facebook'
369 def report_login(self):
370 """Report attempt to log in."""
371 self.to_screen(u'Logging in')
# Runs once before extraction: best-effort login; failures only warn.
373 def _real_initialize(self):
374 if self._downloader is None:
379 downloader_params = self._downloader.params
381 # Attempt to use provided username and password or .netrc data
382 if downloader_params.get('username', None) is not None:
383 useremail = downloader_params['username']
384 password = downloader_params['password']
385 elif downloader_params.get('usenetrc', False):
387 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
# Raised when .netrc has no entry for the 'facebook' machine.
392 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
393 except (IOError, netrc.NetrcParseError) as err:
394 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
# No credentials at all: skip login entirely (body elided).
397 if useremail is None:
406 request = compat_urllib_request.Request(self._LOGIN_URL, compat_urllib_parse.urlencode(login_form))
409 login_results = compat_urllib_request.urlopen(request).read()
# The login form reappearing in the response means authentication failed.
410 if re.search(r'<form(.*)name="login"(.*)</form>', login_results) is not None:
411 self._downloader.report_warning(u'unable to log in: bad username/password, or exceded login rate limit (~3/min). Check credentials or wait.')
413 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
414 self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
417 def _real_extract(self, url):
418 mobj = re.match(self._VALID_URL, url)
# Raised when the URL does not match (guard line elided).
420 raise ExtractorError(u'Invalid URL: %s' % url)
421 video_id = mobj.group('ID')
423 url = 'https://www.facebook.com/video/video.php?v=%s' % video_id
424 webpage = self._download_webpage(url, video_id)
# The JSON payload sits between these two exact JS snippets in the page.
426 BEFORE = '{swf.addParam(param[0], param[1]);});\n'
427 AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});'
428 m = re.search(re.escape(BEFORE) + '(.*?)' + re.escape(AFTER), webpage)
430 raise ExtractorError(u'Cannot parse data')
431 data = dict(json.loads(m.group(1)))
# 'params' is URL-encoded JSON nested inside the outer JSON blob.
432 params_raw = compat_urllib_parse.unquote(data['params'])
433 params = json.loads(params_raw)
434 video_data = params['video_data'][0]
# Prefer HD, fall back to SD (the elif/else guards are elided).
435 video_url = video_data.get('hd_src')
437 video_url = video_data['sd_src']
439 raise ExtractorError(u'Cannot find video URL')
440 video_duration = int(video_data['video_duration'])
441 thumbnail = video_data['thumbnail_src']
443 video_title = self._html_search_regex('<h2 class="uiHeaderTitle">([^<]+)</h2>',
# Returned info dict — surrounding braces and 'id'/'url' keys elided.
448 'title': video_title,
451 'duration': video_duration,
452 'thumbnail': thumbnail,
# Extractor for blip.tv: normalizes api.swf# and /play/ URLs (following the
# player redirect to a canonical /a/a-<id> URL), then fetches the JSON
# metadata endpoint (skin=json) with an iTunes User-Agent; direct video
# responses (Content-Type video/*) are detected and handled separately.
# NOTE(review): elided view — fused original line numbers jump; 'cchar',
# 'info' initialization, 'try:' headers and the direct-download dict are
# not visible here.
457 class BlipTVIE(InfoExtractor):
458 """Information extractor for blip.tv"""
460 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?blip\.tv/((.+/)|(play/)|(api\.swf#))(.+)$'
# Captures the lowercase filename extension of a media URL.
461 _URL_EXT = r'^.*\.([a-z0-9]+)$'
464 def report_direct_download(self, title):
465 """Report information extraction."""
466 self.to_screen(u'%s: Direct download detected' % title)
468 def _real_extract(self, url):
469 mobj = re.match(self._VALID_URL, url)
# Raised when the URL does not match (guard line elided).
471 raise ExtractorError(u'Invalid URL: %s' % url)
473 # See https://github.com/rg3/youtube-dl/issues/857
474 api_mobj = re.match(r'http://a\.blip\.tv/api\.swf#(?P<video_id>[\d\w]+)', url)
475 if api_mobj is not None:
476 url = 'http://blip.tv/play/g_%s' % api_mobj.group('video_id')
477 urlp = compat_urllib_parse_urlparse(url)
478 if urlp.path.startswith('/play/'):
# /play/ URLs redirect to a URL whose fragment carries the file id; follow
# it, rebuild the canonical URL, and recurse once.
479 request = compat_urllib_request.Request(url)
480 response = compat_urllib_request.urlopen(request)
481 redirecturl = response.geturl()
482 rurlp = compat_urllib_parse_urlparse(redirecturl)
483 file_id = compat_parse_qs(rurlp.fragment)['file'][0].rpartition('/')[2]
484 url = 'http://blip.tv/a/a-' + file_id
485 return self._real_extract(url)
# 'cchar' ('?' or '&') is chosen on an elided line based on the URL.
492 json_url = url + cchar + 'skin=json&version=2&no_wrap=1'
493 request = compat_urllib_request.Request(json_url)
# blip.tv serves richer JSON to the iTunes client.
494 request.add_header('User-Agent', 'iTunes/10.6.1')
495 self.report_extraction(mobj.group(1))
498 urlh = compat_urllib_request.urlopen(request)
499 if urlh.headers.get('Content-Type', '').startswith('video/'): # Direct download
500 basename = url.split('/')[-1]
501 title,ext = os.path.splitext(basename)
# NOTE(review): .decode on a str fails on Python 3 — Python 2 era code.
502 title = title.decode('UTF-8')
503 ext = ext.replace('.', '')
504 self.report_direct_download(title)
# The direct-download info dict (original lines 505-513) is elided.
514 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
515 raise ExtractorError(u'ERROR: unable to download video info webpage: %s' % compat_str(err))
516 if info is None: # Regular URL
518 json_code_bytes = urlh.read()
519 json_code = json_code_bytes.decode('utf-8')
520 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
521 raise ExtractorError(u'Unable to read video info webpage: %s' % compat_str(err))
524 json_data = json.loads(json_code)
# Some responses wrap the payload in a 'Post' key, some do not.
525 if 'Post' in json_data:
526 data = json_data['Post']
# Site timestamp format, e.g. '05-31-13 07:24PM' -> YYYYMMDD.
530 upload_date = datetime.datetime.strptime(data['datestamp'], '%m-%d-%y %H:%M%p').strftime('%Y%m%d')
531 video_url = data['media']['url']
532 umobj = re.match(self._URL_EXT, video_url)
534 raise ValueError('Can not determine filename extension')
# Info dict literal — surrounding braces and 'url'/'ext' keys elided.
538 'id': data['item_id'],
540 'uploader': data['display_name'],
541 'upload_date': upload_date,
542 'title': data['title'],
544 'format': data['media']['mimeType'],
545 'thumbnail': data['thumbnailUrl'],
546 'description': data['description'],
547 'player_url': data['embedUrl'],
# Propagated so the downloader uses the same UA the metadata request used.
548 'user_agent': 'iTunes/10.6.1',
550 except (ValueError,KeyError) as err:
551 raise ExtractorError(u'Unable to parse video information: %s' % repr(err))
# Extractor for myvideo.de: tries the plain <source> tag first; otherwise
# fetches an RC4-encrypted player XML (key derived from a hard-coded
# double-base64 salt plus the video id), decrypts it, and extracts either an
# RTMP connection URL or a plain HTTP/F4M/HLS path.
# NOTE(review): elided view — fused original line numbers jump; loop/guard
# headers, 'params'/'sec'/'sk'/'video_filepath' assignments and the md5
# helper's def line are not visible here.
556 class MyVideoIE(InfoExtractor):
557 """Information Extractor for myvideo.de."""
559 _VALID_URL = r'(?:http://)?(?:www\.)?myvideo\.de/watch/([0-9]+)/([^?/]+).*'
562 # Original Code from: https://github.com/dersphere/plugin.video.myvideo_de.git
563 # Released into the Public Domain by Tristan Fischer on 2013-05-19
564 # https://github.com/rg3/youtube-dl/pull/842
# Standard RC4: key-scheduling loop over the 256-byte box, then the PRGA
# XOR stream over 'data'. 'x'/'y'/'out' init and the data loop header are on
# elided lines.
565 def __rc4crypt(self,data, key):
567 box = list(range(256))
568 for i in list(range(256)):
569 x = (x + box[i] + compat_ord(key[i % len(key)])) % 256
570 box[i], box[x] = box[x], box[i]
576 y = (y + box[x]) % 256
577 box[x], box[y] = box[y], box[x]
578 out += chr(compat_ord(char) ^ box[(box[x] + box[y]) % 256])
# Tail of an md5 helper (its def line is elided): hex digest as bytes.
582 return hashlib.md5(s).hexdigest().encode()
584 def _real_extract(self,url):
585 mobj = re.match(self._VALID_URL, url)
# Raised when the URL does not match (guard line elided).
587 raise ExtractorError(u'invalid URL: %s' % url)
589 video_id = mobj.group(1)
# Hard-coded, double-base64-encoded key material ('GK'); the assignment's
# opening line is elided.
592 b'WXpnME1EZGhNRGhpTTJNM01XVmhOREU0WldNNVpHTTJOakpt'
593 b'TW1FMU5tVTBNR05pWkRaa05XRXhNVFJoWVRVd1ptSXhaVEV3'
594 b'TnpsbA0KTVRkbU1tSTRNdz09'
598 webpage_url = 'http://www.myvideo.de/watch/%s' % video_id
599 webpage = self._download_webpage(webpage_url, video_id)
# Easy path: a plain <source src='...'> tag yields a direct FLV URL.
601 mobj = re.search('source src=\'(.+?)[.]([^.]+)\'', webpage)
603 self.report_extraction(video_id)
604 video_url = mobj.group(1) + '.flv'
606 video_title = self._html_search_regex('<title>([^<]+)</title>',
609 video_ext = self._search_regex('[.](.+?)$', video_url, u'extension')
# Early-return info dict for the easy path (braces/keys elided).
616 'title': video_title,
# Hard path: player configuration lives in a JS flashvars object.
621 mobj = re.search('var flashvars={(.+?)}', webpage)
623 raise ExtractorError(u'Unable to extract video')
# 'sec' and 'params' come from elided lines; '_encxml' is handled separately
# from the other flashvars entries.
628 for (a, b) in re.findall('(.+?):\'(.+?)\',?', sec):
629 if not a == '_encxml':
632 encxml = compat_urllib_parse.unquote(b)
633 if not params.get('domain'):
634 params['domain'] = 'www.myvideo.de'
635 xmldata_url = '%s?%s' % (encxml, compat_urllib_parse.urlencode(params))
# The MTV player variant is sidestepped by forcing playertype D.
636 if 'flash_playertype=MTV' in xmldata_url:
637 self._downloader.report_warning(u'avoiding MTV player')
639 'http://www.myvideo.de/dynamic/get_player_video_xml.php'
640 '?flash_playertype=D&ID=%s&_countlimit=4&autorun=yes'
# Response is 'something=<hex>'; keep the hex side and un-hex it.
644 enc_data = self._download_webpage(xmldata_url, video_id).split('=')[1]
645 enc_data_b = binascii.unhexlify(enc_data)
# RC4 key 'sk': md5 of decoded GK + video id (assignment header elided).
647 base64.b64decode(base64.b64decode(GK)) +
649 str(video_id).encode('utf-8')
652 dec_data = self.__rc4crypt(enc_data_b, sk)
655 self.report_extraction(video_id)
# RTMP branch: connectionurl present in the decrypted XML.
658 mobj = re.search('connectionurl=\'(.*?)\'', dec_data)
660 video_url = compat_urllib_parse.unquote(mobj.group(1))
661 if 'myvideo2flash' in video_url:
662 self._downloader.report_warning(u'forcing RTMPT ...')
663 video_url = video_url.replace('rtmpe://', 'rtmpt://')
666 # extract non rtmp videos
667 mobj = re.search('path=\'(http.*?)\' source=\'(.*?)\'', dec_data)
669 raise ExtractorError(u'unable to extract url')
670 video_url = compat_urllib_parse.unquote(mobj.group(1)) + compat_urllib_parse.unquote(mobj.group(2))
672 video_file = self._search_regex('source=\'(.*?)\'', dec_data, u'video file')
673 video_file = compat_urllib_parse.unquote(video_file)
# F4M files get a derived HLS (.m3u8) playlist; others get an RTMP playpath.
675 if not video_file.endswith('f4m'):
676 ppath, prefix = video_file.split('.')
677 video_playpath = '%s:%s' % (prefix, ppath)
678 video_hls_playlist = ''
# 'video_filepath' is assigned on an elided line.
681 video_hls_playlist = (
682 video_filepath + video_file
683 ).replace('.f4m', '.m3u8')
685 video_swfobj = self._search_regex('swfobject.embedSWF\(\'(.+?)\'', webpage, u'swfobj')
686 video_swfobj = compat_urllib_parse.unquote(video_swfobj)
688 video_title = self._html_search_regex("<h1(?: class='globalHd')?>(.*?)</h1>",
# Final info dict — surrounding braces and 'id'/'url' keys elided.
697 'title': video_title,
699 'play_path': video_playpath,
700 'video_file': video_file,
701 'video_hls_playlist': video_hls_playlist,
702 'player_url': video_swfobj,
# Extractor for The Daily Show / The Colbert Report: resolves shortname and
# clip/episode URLs, locates the mtvnservices media URI in the page, walks
# the MRSS show index, downloads per-media config XML, picks a bitrate
# rendition, and rewrites the RTMP URL to a progressive HTTP mirror.
# NOTE(review): elided view — fused original line numbers jump; dict literal
# bodies, several guards, 'turls' construction and the results list are not
# visible here.
706 class ComedyCentralIE(InfoExtractor):
707 """Information extractor for The Daily Show and Colbert Report """
709 # urls can be abbreviations like :thedailyshow or :colbert
710 # urls for episodes like:
711 # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day
712 # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news
713 # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524
# Verbose regex — must always be matched with re.VERBOSE (see suitable()).
714 _VALID_URL = r"""^(:(?P<shortname>tds|thedailyshow|cr|colbert|colbertnation|colbertreport)
715 |(https?://)?(www\.)?
716 (?P<showname>thedailyshow|colbertnation)\.com/
717 (full-episodes/(?P<episode>.*)|
719 (the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?))
720 |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
# Known bitrates, lowest preference first (last entry is picked by default).
723 _available_formats = ['3500', '2200', '1700', '1200', '750', '400']
# Bitrate -> extension / display-dimension maps (literal bodies elided).
725 _video_extensions = {
733 _video_dimensions = {
# Overridden because the base class would not apply re.VERBOSE.
743 def suitable(cls, url):
744 """Receives a URL and returns True if suitable for this IE."""
745 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
747 def _print_formats(self, formats):
748 print('Available formats:')
750 print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'mp4'), self._video_dimensions.get(x, '???')))
753 def _real_extract(self, url):
754 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
# Raised when the URL does not match (guard line elided).
756 raise ExtractorError(u'Invalid URL: %s' % url)
# Shortnames (":tds", ":colbert", ...) expand to the full-episodes page.
758 if mobj.group('shortname'):
759 if mobj.group('shortname') in ('tds', 'thedailyshow'):
760 url = u'http://www.thedailyshow.com/full-episodes/'
762 url = u'http://www.colbertnation.com/full-episodes/'
763 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
764 assert mobj is not None
766 if mobj.group('clip'):
767 if mobj.group('showname') == 'thedailyshow':
768 epTitle = mobj.group('tdstitle')
770 epTitle = mobj.group('cntitle')
# Episode path: an empty 'episode' group means "download the newest".
773 dlNewest = not mobj.group('episode')
775 epTitle = mobj.group('showname')
777 epTitle = mobj.group('episode')
779 self.report_extraction(epTitle)
780 webpage,htmlHandle = self._download_webpage_handle(url, epTitle)
# After a newest-episode redirect, re-match the final URL (guard elided).
782 url = htmlHandle.geturl()
783 mobj = re.match(self._VALID_URL, url, re.VERBOSE)
785 raise ExtractorError(u'Invalid redirected URL: ' + url)
786 if mobj.group('episode') == '':
787 raise ExtractorError(u'Redirected URL is still not specific: ' + url)
788 epTitle = mobj.group('episode')
790 mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage)
792 if len(mMovieParams) == 0:
793 # The Colbert Report embeds the information in a without
794 # a URL prefix; so extract the alternate reference
795 # and then add the URL prefix manually.
797 altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video).*?:.*?)"', webpage)
798 if len(altMovieParams) == 0:
799 raise ExtractorError(u'unable to find Flash URL in webpage ' + url)
801 mMovieParams = [("http://media.mtvnservices.com/" + altMovieParams[0], altMovieParams[0])]
803 uri = mMovieParams[0][1]
804 indexUrl = 'http://shadow.comedycentral.com/feeds/video_player/mrss/?' + compat_urllib_parse.urlencode({'uri': uri})
805 indexXml = self._download_webpage(indexUrl, epTitle,
806 u'Downloading show index',
807 u'unable to download episode index')
811 idoc = xml.etree.ElementTree.fromstring(indexXml)
812 itemEls = idoc.findall('.//item')
# One MRSS <item> per episode part; each is processed independently.
813 for partNum,itemEl in enumerate(itemEls):
814 mediaId = itemEl.findall('./guid')[0].text
815 shortMediaId = mediaId.split(':')[-1]
816 showId = mediaId.split(':')[-2].replace('.com', '')
817 officialTitle = itemEl.findall('./title')[0].text
818 officialDate = unified_strdate(itemEl.findall('./pubDate')[0].text)
820 configUrl = ('http://www.comedycentral.com/global/feeds/entertainment/media/mediaGenEntertainment.jhtml?' +
821 compat_urllib_parse.urlencode({'uri': mediaId}))
822 configXml = self._download_webpage(configUrl, epTitle,
823 u'Downloading configuration for %s' % shortMediaId)
825 cdoc = xml.etree.ElementTree.fromstring(configXml)
# 'turls' (bitrate, rtmp-url) list is initialized on an elided line.
827 for rendition in cdoc.findall('.//rendition'):
828 finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text)
832 self._downloader.report_error(u'unable to download ' + mediaId + ': No videos found')
835 if self._downloader.params.get('listformats', None):
836 self._print_formats([i[0] for i in turls])
839 # For now, just pick the highest bitrate
840 format,rtmp_video_url = turls[-1]
842 # Get the format arg from the arg stream
843 req_format = self._downloader.params.get('format', None)
845 # Select format if we can find one
# (f, v) iteration over turls matching req_format is on elided lines.
848 format, rtmp_video_url = f, v
# RTMP streams are not downloadable directly; map the storage path onto a
# known progressive-HTTP mirror instead.
851 m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp.comedystor/.*)$', rtmp_video_url)
853 raise ExtractorError(u'Cannot transform RTMP url')
854 base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
855 video_url = base + m.group('finalid')
857 effTitle = showId + u'-' + epTitle + u' part ' + compat_str(partNum+1)
# Per-part info dict — surrounding braces and remaining keys elided.
862 'upload_date': officialDate,
867 'description': officialTitle,
# Extractor for escapistmagazine.com: reads metadata from <meta> tags, pulls
# the player config URL out of the og:video URL, downloads the JS-style
# config, coerces it to JSON, and takes the stream URL from its playlist.
# NOTE(review): elided view — fused original line numbers jump; the URL guard,
# 'try:' header and the return-dict braces are not visible here.
874 class EscapistIE(InfoExtractor):
875 """Information extractor for The Escapist """
877 _VALID_URL = r'^(https?://)?(www\.)?escapistmagazine\.com/videos/view/(?P<showname>[^/]+)/(?P<episode>[^/?]+)[/?]?.*$'
878 IE_NAME = u'escapist'
880 def _real_extract(self, url):
881 mobj = re.match(self._VALID_URL, url)
# Raised when the URL does not match (guard line elided).
883 raise ExtractorError(u'Invalid URL: %s' % url)
884 showName = mobj.group('showname')
885 videoId = mobj.group('episode')
887 self.report_extraction(videoId)
888 webpage = self._download_webpage(url, videoId)
890 videoDesc = self._html_search_regex('<meta name="description" content="([^"]*)"',
891 webpage, u'description', fatal=False)
893 imgUrl = self._html_search_regex('<meta property="og:image" content="([^"]*)"',
894 webpage, u'thumbnail', fatal=False)
896 playerUrl = self._html_search_regex('<meta property="og:video" content="([^"]*)"',
897 webpage, u'player url')
899 title = self._html_search_regex('<meta name="title" content="([^"]*)"',
# NOTE(review): this error label says 'player url' but it extracts the title
# — looks like a copy-paste of the previous call; left as-is.
900 webpage, u'player url').split(' : ')[-1]
902 configUrl = self._search_regex('config=(.*)$', playerUrl, u'config url')
903 configUrl = compat_urllib_parse.unquote(configUrl)
905 configJSON = self._download_webpage(configUrl, videoId,
906 u'Downloading configuration',
907 u'unable to download configuration')
909 # Technically, it's JavaScript, not JSON
# Crude quote swap to make the JS object parseable as JSON — breaks if any
# value legitimately contains a single quote.
910 configJSON = configJSON.replace("'", '"')
913 config = json.loads(configJSON)
914 except (ValueError,) as err:
915 raise ExtractorError(u'Invalid JSON in configuration file: ' + compat_str(err))
917 playlist = config['playlist']
# Index 1, not 0 — presumably entry 0 is an ad/intro; TODO confirm.
918 videoUrl = playlist[1]['url']
# Returned info dict — surrounding braces and remaining keys elided.
923 'uploader': showName,
928 'description': videoDesc,
929 'player_url': playerUrl,
# Extractor for collegehumor.com: fetches the moogaloop metadata XML for the
# video id, then the Adobe F4M manifest, and assembles a direct segment URL
# from the manifest's media node.
# NOTE(review): elided view — fused original line numbers jump; 'info'
# initialization, 'try:' headers and the final return are not visible here.
934 class CollegeHumorIE(InfoExtractor):
935 """Information extractor for collegehumor.com"""
938 _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
939 IE_NAME = u'collegehumor'
941 def report_manifest(self, video_id):
942 """Report information extraction."""
943 self.to_screen(u'%s: Downloading XML manifest' % video_id)
945 def _real_extract(self, url):
946 mobj = re.match(self._VALID_URL, url)
# Raised when the URL does not match (guard line elided).
948 raise ExtractorError(u'Invalid URL: %s' % url)
949 video_id = mobj.group('videoid')
957 self.report_extraction(video_id)
958 xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
960 metaXml = compat_urllib_request.urlopen(xmlUrl).read()
961 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
962 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
964 mdoc = xml.etree.ElementTree.fromstring(metaXml)
# 'info' dict is created on an elided line; IndexError from findall()[0]
# is presumably caught by an elided except -> 'Invalid metadata XML file'.
966 videoNode = mdoc.findall('./video')[0]
967 info['description'] = videoNode.findall('./description')[0].text
968 info['title'] = videoNode.findall('./caption')[0].text
969 info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
970 manifest_url = videoNode.findall('./file')[0].text
972 raise ExtractorError(u'Invalid metadata XML file')
# hdcore param required by the Adobe HTTP Dynamic Streaming endpoint.
974 manifest_url += '?hdcore=2.10.3'
975 self.report_manifest(video_id)
977 manifestXml = compat_urllib_request.urlopen(manifest_url).read()
978 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
979 raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
981 adoc = xml.etree.ElementTree.fromstring(manifestXml)
# F4M namespace-qualified lookups; any missing node -> invalid manifest.
983 media_node = adoc.findall('./{http://ns.adobe.com/f4m/1.0}media')[0]
984 node_id = media_node.attrib['url']
985 video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text
986 except IndexError as err:
987 raise ExtractorError(u'Invalid manifest file')
989 url_pr = compat_urllib_parse_urlparse(manifest_url)
# Rebuild the first-segment URL per the HDS /z/<id>/<node>Seg1-Frag1 scheme.
990 url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
# Extractor for xvideos.com: scrapes the flv_url parameter, page title and
# thumbnail out of the video page.
# NOTE(review): elided view — fused original line numbers jump; the return
# dict's braces and 'id'/'url' keys are not visible here.
997 class XVideosIE(InfoExtractor):
998 """Information extractor for xvideos.com"""
1000 _VALID_URL = r'^(?:https?://)?(?:www\.)?xvideos\.com/video([0-9]+)(?:.*)'
1001 IE_NAME = u'xvideos'
1003 def _real_extract(self, url):
1004 mobj = re.match(self._VALID_URL, url)
# Raised when the URL does not match (guard line elided).
1006 raise ExtractorError(u'Invalid URL: %s' % url)
1007 video_id = mobj.group(1)
1009 webpage = self._download_webpage(url, video_id)
1011 self.report_extraction(video_id)
# The direct FLV URL is URL-encoded inside the flashvars query string.
1014 video_url = compat_urllib_parse.unquote(self._search_regex(r'flv_url=(.+?)&',
1015 webpage, u'video URL'))
# Title is the <title> text minus the trailing " - XVID..." suffix.
1018 video_title = self._html_search_regex(r'<title>(.*?)\s+-\s+XVID',
1021 # Extract video thumbnail
1022 video_thumbnail = self._search_regex(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)',
1023 webpage, u'thumbnail', fatal=False)
# Returned info dict — surrounding braces and remaining keys elided.
1029 'upload_date': None,
1030 'title': video_title,
1032 'thumbnail': video_thumbnail,
1033 'description': None,
# Extractor for a single soundcloud.com track: resolves the pretty URL to a
# track id via the public resolve.json API, then reads the 128kbps MP3
# stream URL from the streams endpoint (both with a hard-coded client_id).
# NOTE(review): elided view — fused original line numbers jump; the URL guard
# and the return-dict braces are not visible here.
1039 class SoundcloudIE(InfoExtractor):
1040 """Information extractor for soundcloud.com
1041 To access the media, the uid of the song and a stream token
1042 must be extracted from the page source and the script must make
1043 a request to media.soundcloud.com/crossdomain.xml. Then
1044 the media can be grabbed by requesting from an url composed
1045 of the stream token and uid
1048 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)'
1049 IE_NAME = u'soundcloud'
1051 def report_resolve(self, video_id):
1052 """Report information extraction."""
1053 self.to_screen(u'%s: Resolving id' % video_id)
1055 def _real_extract(self, url):
1056 mobj = re.match(self._VALID_URL, url)
# Raised when the URL does not match (guard line elided).
1058 raise ExtractorError(u'Invalid URL: %s' % url)
1060 # extract uploader (which is in the url)
1061 uploader = mobj.group(1)
1062 # extract simple title (uploader + slug of song title)
1063 slug_title = mobj.group(2)
1064 simple_title = uploader + u'-' + slug_title
1065 full_title = '%s/%s' % (uploader, slug_title)
1067 self.report_resolve(full_title)
# Canonicalize, then resolve the web URL to the API track object.
1069 url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
1070 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
1071 info_json = self._download_webpage(resolv_url, full_title, u'Downloading info JSON')
1073 info = json.loads(info_json)
1074 video_id = info['id']
1075 self.report_extraction(full_title)
1077 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
1078 stream_json = self._download_webpage(streams_url, full_title,
1079 u'Downloading stream definitions',
1080 u'unable to download stream definitions')
1082 streams = json.loads(stream_json)
# Fixed 128kbps MP3 variant is the only one used.
1083 mediaURL = streams['http_mp3_128_url']
1084 upload_date = unified_strdate(info['created_at'])
# Returned info dict — surrounding braces and 'id'/'url' keys elided.
1089 'uploader': info['user']['username'],
1090 'upload_date': upload_date,
1091 'title': info['title'],
1093 'description': info['description'],
# Extractor for soundcloud.com sets (playlists): resolves the set URL via
# resolve.json, then fetches the stream definition for every track in the
# set, building one info dict per track.
# NOTE(review): elided view — fused original line numbers jump; the URL
# guard, per-track dict braces and the final return are not visible here.
1096 class SoundcloudSetIE(InfoExtractor):
1097 """Information extractor for soundcloud.com sets
1098 To access the media, the uid of the song and a stream token
1099 must be extracted from the page source and the script must make
1100 a request to media.soundcloud.com/crossdomain.xml. Then
1101 the media can be grabbed by requesting from an url composed
1102 of the stream token and uid
1105 _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
1106 IE_NAME = u'soundcloud:set'
1108 def report_resolve(self, video_id):
1109 """Report information extraction."""
1110 self.to_screen(u'%s: Resolving id' % video_id)
1112 def _real_extract(self, url):
1113 mobj = re.match(self._VALID_URL, url)
# Raised when the URL does not match (guard line elided).
1115 raise ExtractorError(u'Invalid URL: %s' % url)
1117 # extract uploader (which is in the url)
1118 uploader = mobj.group(1)
1119 # extract simple title (uploader + slug of song title)
1120 slug_title = mobj.group(2)
1121 simple_title = uploader + u'-' + slug_title
1122 full_title = '%s/sets/%s' % (uploader, slug_title)
1124 self.report_resolve(full_title)
# Same resolve.json flow as SoundcloudIE, but against the /sets/ URL.
1126 url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
1127 resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
1128 info_json = self._download_webpage(resolv_url, full_title)
1131 info = json.loads(info_json)
# API-level errors are reported per entry, then extraction aborts
# (the return/raise after the loop is elided).
1132 if 'errors' in info:
1133 for err in info['errors']:
1134 self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
1137 self.report_extraction(full_title)
# One streams request per track in the set.
1138 for track in info['tracks']:
1139 video_id = track['id']
1141 streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
1142 stream_json = self._download_webpage(streams_url, video_id, u'Downloading track info JSON')
1144 self.report_extraction(video_id)
1145 streams = json.loads(stream_json)
1146 mediaURL = streams['http_mp3_128_url']
# Per-track info dict — surrounding braces and 'id'/'url' keys elided.
1151 'uploader': track['user']['username'],
1152 'upload_date': unified_strdate(track['created_at']),
1153 'title': track['title'],
1155 'description': track['description'],
class InfoQIE(InfoExtractor):
    """Information extractor for infoq.com"""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?infoq\.com/[^/]+/[^/]+$'

    def _real_extract(self, url):
        """Decode the base64-obfuscated rtmpe path embedded in the page
        and return a single-entry result list."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        webpage = self._download_webpage(url, video_id=url)
        self.report_extraction(url)

        # Extract video URL: the real id is base64-encoded in 'jsclassref'
        mobj = re.search(r"jsclassref ?= ?'([^']*)'", webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video url')
        real_id = compat_urllib_parse.unquote(base64.b64decode(mobj.group(1).encode('ascii')).decode('utf-8'))
        video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id

        # Extract title
        video_title = self._search_regex(r'contentTitle = "(.*?)";',
            webpage, u'title')

        # Extract description
        video_description = self._html_search_regex(r'<meta name="description" content="(.*)"(?:\s*/)?>',
            webpage, u'description', fatal=False)

        video_filename = video_url.split('/')[-1]
        video_id, extension = video_filename.split('.')

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': extension, # Extension is always(?) mp4, but seems to be flv
            'thumbnail': None,
            'description': video_description,
        }

        return [info]
class MixcloudIE(InfoExtractor):
    """Information extractor for www.mixcloud.com"""

    _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
    _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
    IE_NAME = u'mixcloud'

    def report_download_json(self, file_id):
        """Report JSON download."""
        self.to_screen(u'Downloading json')

    def get_urls(self, jsonData, fmt, bitrate='best'):
        """Get urls from 'audio_formats' section in json"""
        try:
            bitrate_list = jsonData[fmt]
            if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
                bitrate = max(bitrate_list) # select highest

            url_list = jsonData[fmt][bitrate]
        except TypeError: # we have no bitrate info.
            url_list = jsonData[fmt]
        return url_list

    def check_urls(self, url_list):
        """Returns 1st active url from list"""
        for url in url_list:
            try:
                # A successful open is good enough; we only need to know
                # the url is alive.
                compat_urllib_request.urlopen(url)
                return url
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                url = None

        return None

    def _print_formats(self, formats):
        """Print a table of every available (format, bitrate, ext) triple."""
        print('Available formats:')
        for fmt in formats.keys():
            for b in formats[fmt]:
                try:
                    ext = formats[fmt][b][0]
                    print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
                except TypeError: # we have no bitrate info
                    ext = formats[fmt][0]
                    print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
                    break

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        # extract uploader & filename from url
        uploader = mobj.group(1).decode('utf-8')
        file_id = uploader + "-" + mobj.group(2).decode('utf-8')

        # construct API request
        file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
        # retrieve .json file with links to files
        request = compat_urllib_request.Request(file_url)
        try:
            self.report_download_json(file_url)
            jsonData = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))

        # parse JSON
        json_data = json.loads(jsonData)
        player_url = json_data['player_swf_url']
        formats = dict(json_data['audio_formats'])

        req_format = self._downloader.params.get('format', None)

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        if req_format is None or req_format == 'best':
            # probe formats until one of them has a live url
            for format_param in formats.keys():
                url_list = self.get_urls(formats, format_param)
                file_url = self.check_urls(url_list)
                if file_url is not None:
                    break # got it!
        else:
            if req_format not in formats:
                raise ExtractorError(u'Format is not available')

            url_list = self.get_urls(formats, req_format)
            file_url = self.check_urls(url_list)
            format_param = req_format

        return [{
            'id': file_id.decode('utf-8'),
            'url': file_url.decode('utf-8'),
            'uploader': uploader.decode('utf-8'),
            'upload_date': None,
            'title': json_data['name'],
            'ext': file_url.split('.')[-1].decode('utf-8'),
            'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
            'thumbnail': json_data['thumbnail_url'],
            'description': json_data['description'],
            'player_url': player_url.decode('utf-8'),
        }]
class StanfordOpenClassroomIE(InfoExtractor):
    """Information extractor for Stanford's Open ClassRoom"""

    _VALID_URL = r'^(?:https?://)?openclassroom.stanford.edu(?P<path>/?|(/MainFolder/(?:HomePage|CoursePage|VideoPage)\.php([?]course=(?P<course>[^&]+)(&video=(?P<video>[^&]+))?(&.*)?)?))$'
    IE_NAME = u'stanfordoc'

    def _real_extract(self, url):
        """Dispatch on URL shape: a single video, one course page, or the
        site root; the latter two recurse via self.extract on each child."""
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        if mobj.group('course') and mobj.group('video'): # A specific video
            course = mobj.group('course')
            video = mobj.group('video')
            info = {
                'id': course + '_' + video,
                'uploader': None,
                'upload_date': None,
            }

            self.report_extraction(info['id'])
            baseUrl = 'http://openclassroom.stanford.edu/MainFolder/courses/' + course + '/videos/'
            xmlUrl = baseUrl + video + '.xml'
            try:
                metaXml = compat_urllib_request.urlopen(xmlUrl).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
            mdoc = xml.etree.ElementTree.fromstring(metaXml)
            try:
                info['title'] = mdoc.findall('./title')[0].text
                info['url'] = baseUrl + mdoc.findall('./videoFile')[0].text
            except IndexError:
                raise ExtractorError(u'Invalid metadata XML file')
            info['ext'] = info['url'].rpartition('.')[2]
            return [info]
        elif mobj.group('course'): # A course page
            course = mobj.group('course')
            info = {
                'id': course,
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            coursepage = self._download_webpage(url, info['id'],
                                        note='Downloading course info page',
                                        errnote='Unable to download course info page')

            info['title'] = self._html_search_regex('<h1>([^<]+)</h1>', coursepage, 'title', default=info['id'])

            info['description'] = self._html_search_regex('<description>([^<]+)</description>',
                coursepage, u'description', fatal=False)

            links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(vpage),
                }
                    for vpage in links]
            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
        else: # Root page
            info = {
                'id': 'Stanford OpenClassroom',
                'type': 'playlist',
                'uploader': None,
                'upload_date': None,
            }

            self.report_download_webpage(info['id'])
            rootURL = 'http://openclassroom.stanford.edu/MainFolder/HomePage.php'
            try:
                rootpage = compat_urllib_request.urlopen(rootURL).read()
            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
                raise ExtractorError(u'Unable to download course info page: ' + compat_str(err))

            info['title'] = info['id']

            links = orderedSet(re.findall('<a href="(CoursePage.php\?[^"]+)">', rootpage))
            info['list'] = [
                {
                    'type': 'reference',
                    'url': 'http://openclassroom.stanford.edu/MainFolder/' + unescapeHTML(cpage),
                }
                    for cpage in links]

            results = []
            for entry in info['list']:
                assert entry['type'] == 'reference'
                results += self.extract(entry['url'])
            return results
class MTVIE(InfoExtractor):
    """Information extractor for MTV.com"""

    _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
    IE_NAME = u'mtv'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        if not mobj.group('proto'):
            url = 'http://' + url
        video_id = mobj.group('videoid')

        webpage = self._download_webpage(url, video_id)

        song_name = self._html_search_regex(r'<meta name="mtv_vt" content="([^"]+)"/>',
            webpage, u'song name', fatal=False)

        video_title = self._html_search_regex(r'<meta name="mtv_an" content="([^"]+)"/>',
            webpage, u'title')

        # FIX: 'performer' was referenced in the result dict but never
        # assigned (NameError at runtime).  The mtv_an meta tag holds the
        # artist name, so use it for the uploader as well.
        performer = video_title

        mtvn_uri = self._html_search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>',
            webpage, u'mtvn_uri', fatal=False)

        content_id = self._search_regex(r'MTVN.Player.defaultPlaylistId = ([0-9]+);',
            webpage, u'content id', fatal=False)

        videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
        self.report_extraction(video_id)
        request = compat_urllib_request.Request(videogen_url)
        try:
            metadataXml = compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))

        mdoc = xml.etree.ElementTree.fromstring(metadataXml)
        renditions = mdoc.findall('.//rendition')

        # For now, always pick the highest quality.
        rendition = renditions[-1]

        try:
            _,_,ext = rendition.attrib['type'].partition('/')
            format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
            video_url = rendition.find('./src').text
        except KeyError:
            raise ExtractorError('Invalid rendition field.')

        info = {
            'id': video_id,
            'url': video_url,
            'uploader': performer,
            'upload_date': None,
            'title': video_title,
            'ext': ext,
            'format': format,
        }

        return [info]
class YoukuIE(InfoExtractor):
    """Information extractor for v.youku.com (segmented flv/mp4 streams)."""
    _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'

    def _gen_sid(self):
        """Build a pseudo-unique session id: ms timestamp + two random ints."""
        nowTime = int(time.time() * 1000)
        random1 = random.randint(1000,1998)
        random2 = random.randint(1000,9999)

        return "%d%d%d" %(nowTime,random1,random2)

    def _get_file_ID_mix_string(self, seed):
        """Derive the character shuffle table from the server-supplied seed
        (linear congruential generator over the source alphabet)."""
        mixed = []
        source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
        seed = float(seed)
        for i in range(len(source)):
            seed = (seed * 211 + 30031 ) % 65536
            index = math.floor(seed / 65536 * len(source) )
            mixed.append(source[int(index)])
            source.remove(source[int(index)])
        #return ''.join(mixed)
        return mixed

    def _get_file_id(self, fileId, seed):
        """Decode the '*'-separated index string into the real file id."""
        mixed = self._get_file_ID_mix_string(seed)
        ids = fileId.split('*')
        realId = []
        for ch in ids:
            if ch:
                realId.append(mixed[int(ch)])
        return ''.join(realId)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('ID')

        info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id

        jsondata = self._download_webpage(info_url, video_id)

        self.report_extraction(video_id)
        try:
            config = json.loads(jsondata)

            video_title = config['data'][0]['title']
            seed = config['data'][0]['seed']

            format = self._downloader.params.get('format', None)
            supported_format = list(config['data'][0]['streamfileids'].keys())

            if format is None or format == 'best':
                if 'hd2' in supported_format:
                    format = 'hd2'
                else:
                    format = 'flv'
                ext = u'flv'
            elif format == 'worst':
                format = 'mp4'
                ext = u'mp4'
            else:
                format = 'flv'
                ext = u'flv'

            fileid = config['data'][0]['streamfileids'][format]
            keys = [s['k'] for s in config['data'][0]['segs'][format]]
        except (UnicodeDecodeError, ValueError, KeyError):
            raise ExtractorError(u'Unable to extract info section')

        files_info = []
        sid = self._gen_sid()
        fileid = self._get_file_id(fileid, seed)

        #column 8,9 of fileid represent the segment number
        #fileid[7:9] should be changed
        for index, key in enumerate(keys):
            temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
            download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)

            info = {
                'id': '%s_part%02d' % (video_id, index),
                'url': download_url,
                'uploader': None,
                'upload_date': None,
                'title': video_title,
                'ext': ext,
            }
            files_info.append(info)

        return files_info
class XNXXIE(InfoExtractor):
    """Information extractor for xnxx.com"""

    _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
    IE_NAME = u'xnxx'
    VIDEO_URL_RE = r'flv_url=(.*?)&'
    VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
    VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # flv url is percent-encoded inside the page source
        video_url = self._search_regex(self.VIDEO_URL_RE,
            webpage, u'video URL')
        video_url = compat_urllib_parse.unquote(video_url)

        video_title = self._html_search_regex(self.VIDEO_TITLE_RE,
            webpage, u'title')

        video_thumbnail = self._search_regex(self.VIDEO_THUMB_RE,
            webpage, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': None,
            'upload_date': None,
            'title': video_title,
            'ext': 'flv',
            'thumbnail': video_thumbnail,
            'description': None,
        }]
class GooglePlusIE(InfoExtractor):
    """Information extractor for plus.google.com."""

    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
    IE_NAME = u'plus.google'

    def _real_extract(self, url):
        # Extract id from URL
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        post_url = mobj.group(0)
        video_id = mobj.group(1)

        video_extension = 'flv'

        # Step 1, Retrieve post webpage to extract further information
        webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage')

        self.report_extraction(video_id)

        # Extract update date
        upload_date = self._html_search_regex('title="Timestamp">(.*?)</a>',
            webpage, u'upload date', fatal=False)
        if upload_date:
            # Convert timestring to a format suitable for filename
            # (guarded: fatal=False above may leave upload_date as None)
            upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
            upload_date = upload_date.strftime('%Y%m%d')

        # Extract uploader
        uploader = self._html_search_regex(r'rel\="author".*?>(.*?)</a>',
            webpage, u'uploader', fatal=False)

        # Extract title
        # Get the first line for title
        video_title = self._html_search_regex(r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]',
            webpage, 'title', default=u'NA')

        # Step 2, Stimulate clicking the image box to launch video
        video_page = self._search_regex('"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]',
            webpage, u'video page URL')
        webpage = self._download_webpage(video_page, video_id, u'Downloading video page')

        # Extract video links on video page
        """Extract video links of all sizes"""
        pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
        mobj = re.findall(pattern, webpage)
        if len(mobj) == 0:
            raise ExtractorError(u'Unable to extract video links')

        # Sort in resolution
        links = sorted(mobj)

        # Choose the lowest of the sort, i.e. highest resolution
        video_url = links[-1]
        # Only get the url. The resolution part in the tuple has no use anymore
        video_url = video_url[-1]
        # Treat escaped \u0026 style hex
        try:
            video_url = video_url.decode("unicode_escape")
        except AttributeError: # Python 3
            video_url = bytes(video_url, 'ascii').decode('unicode-escape')

        return [{
            'id': video_id,
            'url': video_url,
            'uploader': uploader,
            'upload_date': upload_date,
            'title': video_title,
            'ext': video_extension,
        }]
class NBAIE(InfoExtractor):
    """Information extractor for nba.com video pages."""
    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*?)(?:/index\.html)?(?:\?.*)?$'
    IE_NAME = u'nba'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group(1)

        webpage = self._download_webpage(url, video_id)

        # Direct CDN url built from the path component of the page url
        video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'

        shortened_video_id = video_id.rpartition('/')[2]
        title = self._html_search_regex(r'<meta property="og:title" content="(.*?)"',
            webpage, 'title', default=shortened_video_id).replace('NBA.com: ', '')

        # It isn't there in the HTML it returns to us
        # uploader_date = self._html_search_regex(r'<b>Date:</b> (.*?)</div>', webpage, 'upload_date', fatal=False)

        description = self._html_search_regex(r'<meta name="description" (?:content|value)="(.*?)" />', webpage, 'description', fatal=False)

        info = {
            'id': shortened_video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            # 'uploader_date': uploader_date,
            'description': description,
        }
        return [info]
class JustinTVIE(InfoExtractor):
    """Information extractor for justin.tv and twitch.tv"""
    # TODO: One broadcast may be split into multiple videos. The key
    # 'broadcast_id' is the same for all parts, and 'broadcast_part'
    # starts at 1 and increases. Can we treat all parts as one video?

    _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
        (?:
            (?P<channelid>[^/]+)|
            (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
            (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
        )
        /?(?:\#.*)?$
        """
    _JUSTIN_PAGE_LIMIT = 100
    IE_NAME = u'justin.tv'

    def report_download_page(self, channel, offset):
        """Report attempt to download a single page of videos."""
        self.to_screen(u'%s: Downloading video information from %d to %d' %
                (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))

    # Return count of items, list of *valid* items
    def _parse_page(self, url, video_id):
        webpage = self._download_webpage(url, video_id,
                                         u'Downloading video info JSON',
                                         u'unable to download video info JSON')

        response = json.loads(webpage)
        if type(response) != list:
            error_text = response.get('error', 'unknown error')
            raise ExtractorError(u'Justin.tv API: %s' % error_text)
        info = []
        for clip in response:
            video_url = clip['video_file_url']
            if video_url:
                video_extension = os.path.splitext(video_url)[1][1:]
                video_date = re.sub('-', '', clip['start_time'][:10])
                video_uploader_id = clip.get('user_id', clip.get('channel_id'))
                video_id = clip['id']
                video_title = clip.get('title', video_id)
                info.append({
                    'id': video_id,
                    'url': video_url,
                    'title': video_title,
                    'uploader': clip.get('channel_name', video_uploader_id),
                    'uploader_id': video_uploader_id,
                    'upload_date': video_date,
                    'ext': video_extension,
                })
        return (len(response), info)

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        api_base = 'http://api.justin.tv'
        paged = False
        if mobj.group('channelid'):
            # whole channel: archives are paginated
            paged = True
            video_id = mobj.group('channelid')
            api = api_base + '/channel/archives/%s.json' % video_id
        elif mobj.group('chapterid'):
            chapter_id = mobj.group('chapterid')

            webpage = self._download_webpage(url, chapter_id)
            m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
            if not m:
                raise ExtractorError(u'Cannot find archive of a chapter')
            archive_id = m.group(1)

            api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
            chapter_info_xml = self._download_webpage(api, chapter_id,
                                             note=u'Downloading chapter information',
                                             errnote=u'Chapter information download failed')
            doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
            for a in doc.findall('.//archive'):
                if archive_id == a.find('./id').text:
                    break
            else:
                raise ExtractorError(u'Could not find chapter in chapter information')

            video_url = a.find('./video_file_url').text
            video_ext = video_url.rpartition('.')[2] or u'flv'

            chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
            chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
                                   note='Downloading chapter metadata',
                                   errnote='Download of chapter metadata failed')
            chapter_info = json.loads(chapter_info_json)

            bracket_start = int(doc.find('.//bracket_start').text)
            bracket_end = int(doc.find('.//bracket_end').text)

            # TODO determine start (and probably fix up file)
            #  youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
            #video_url += u'?start=' + TODO:start_timestamp
            # bracket_start is 13290, but we want 51670615
            self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
                                            u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))

            info = {
                'id': u'c' + chapter_id,
                'url': video_url,
                'ext': video_ext,
                'title': chapter_info['title'],
                'thumbnail': chapter_info['preview'],
                'description': chapter_info['description'],
                'uploader': chapter_info['channel']['display_name'],
                'uploader_id': chapter_info['channel']['name'],
            }
            return [info]
        else:
            video_id = mobj.group('videoid')
            api = api_base + '/broadcast/by_archive/%s.json' % video_id

        self.report_extraction(video_id)

        info = []
        offset = 0
        limit = self._JUSTIN_PAGE_LIMIT
        while True:
            if paged:
                self.report_download_page(video_id, offset)
            page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
            page_count, page_info = self._parse_page(page_url, video_id)
            info.extend(page_info)
            if not paged or page_count != limit:
                break
            offset += limit
        return info
class FunnyOrDieIE(InfoExtractor):
    """Information extractor for funnyordie.com video pages."""
    _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'invalid URL: %s' % url)

        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)

        video_url = self._html_search_regex(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"',
            webpage, u'video URL', flags=re.DOTALL)

        # title: player heading first, page <title> as fallback
        title = self._html_search_regex((r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>",
            r'<title>(?P<title>[^<]+?)</title>'), webpage, 'title', flags=re.DOTALL)

        video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
            webpage, u'description', fatal=False, flags=re.DOTALL)

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': title,
            'description': video_description,
        }
        return [info]
class SteamIE(InfoExtractor):
    """Information extractor for store.steampowered.com trailers."""
    _VALID_URL = r"""http://store\.steampowered\.com/
                (agecheck/)?
                (?P<urltype>video|app)/ #If the page is only for videos or for a game
                (?P<gameID>\d+)/?
                (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
                """
    _VIDEO_PAGE_TEMPLATE = 'http://store.steampowered.com/video/%s/'
    _AGECHECK_TEMPLATE = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970'

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # overridden because _VALID_URL is a verbose-mode pattern
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url, re.VERBOSE)
        gameID = m.group('gameID')

        videourl = self._VIDEO_PAGE_TEMPLATE % gameID
        webpage = self._download_webpage(videourl, gameID)

        if re.search('<h2>Please enter your birth date to continue:</h2>', webpage) is not None:
            # page is age-gated; fetch again with a pre-filled birth date
            videourl = self._AGECHECK_TEMPLATE % gameID
            self.report_age_confirmation()
            webpage = self._download_webpage(videourl, gameID)

        self.report_extraction(gameID)
        game_title = self._html_search_regex(r'<h2 class="pageheader">(.*?)</h2>',
                                             webpage, 'game title')

        urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
        mweb = re.finditer(urlRE, webpage)
        namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
        titles = re.finditer(namesRE, webpage)
        thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
        thumbs = re.finditer(thumbsRE, webpage)
        videos = []
        for vid,vtitle,thumb in zip(mweb,titles,thumbs):
            video_id = vid.group('videoID')
            title = vtitle.group('videoName')
            video_url = vid.group('videoURL')
            video_thumb = thumb.group('thumbnail')
            if not video_url:
                raise ExtractorError(u'Cannot find video url for %s' % video_id)
            info = {
                'id': video_id,
                'url': video_url,
                'ext': 'flv',
                'title': unescapeHTML(title),
                'thumbnail': video_thumb
            }
            videos.append(info)
        return [self.playlist_result(videos, gameID, game_title)]
class UstreamIE(InfoExtractor):
    """Information extractor for ustream.tv recorded videos."""
    _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
    IE_NAME = u'ustream'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')

        # Direct CDN url; the page is only needed for metadata
        video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        video_title = self._html_search_regex(r'data-title="(?P<title>.+)"',
            webpage, u'title')

        uploader = self._html_search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
            webpage, u'uploader', fatal=False, flags=re.DOTALL)

        thumbnail = self._html_search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"',
            webpage, u'thumbnail', fatal=False)

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'flv',
            'title': video_title,
            'uploader': uploader,
            'thumbnail': thumbnail,
        }
        return info
class WorldStarHipHopIE(InfoExtractor):
    """Information extractor for worldstarhiphop.com / worldstarcandy.com."""
    _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
    IE_NAME = u'WorldStarHipHop'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('id')

        webpage_src = self._download_webpage(url, video_id)

        video_url = self._search_regex(r'so\.addVariable\("file","(.*?)"\)',
            webpage_src, u'video URL')

        if 'mp4' in video_url:
            ext = 'mp4'
        else:
            ext = 'flv'

        video_title = self._html_search_regex(r"<title>(.*)</title>",
            webpage_src, u'title')

        # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
        thumbnail = self._html_search_regex(r'rel="image_src" href="(.*)" />',
            webpage_src, u'thumbnail', fatal=False)

        if not thumbnail:
            _title = r"""candytitles.*>(.*)</span>"""
            mobj = re.search(_title, webpage_src)
            if mobj is not None:
                video_title = mobj.group(1)

        results = [{
            'id': video_id,
            'url' : video_url,
            'title' : video_title,
            'thumbnail' : thumbnail,
            'ext' : ext,
        }]
        return results
class RBMARadioIE(InfoExtractor):
    """Information extractor for rbmaradio.com shows."""
    _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')

        webpage = self._download_webpage(url, video_id)

        # metadata lives in an inline JS assignment: gon.show = {...};
        json_data = self._search_regex(r'window\.gon.*?gon\.show=(.+?);$',
            webpage, u'json data', flags=re.MULTILINE)

        try:
            data = json.loads(json_data)
        except ValueError as e:
            raise ExtractorError(u'Invalid JSON: ' + str(e))

        video_url = data['akamai_url'] + '&cbr=256'
        url_parts = compat_urllib_parse_urlparse(video_url)
        video_ext = url_parts.path.rpartition('.')[2]
        info = {
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': data['title'],
            'description': data.get('teaser_text'),
            'location': data.get('country_of_origin'),
            'uploader': data.get('host', {}).get('name'),
            'uploader_id': data.get('host', {}).get('slug'),
            'thumbnail': data.get('image', {}).get('large_url_2x'),
            'duration': data.get('duration'),
        }
        return [info]
class YouPornIE(InfoExtractor):
    """Information extractor for youporn.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'

    def _print_formats(self, formats):
        """Print all available formats"""
        print(u'Available formats:')
        print(u'ext\t\tformat')
        print(u'---------------------------------')
        for format in formats:
            print(u'%s\t\t%s' % (format['ext'], format['format']))

    def _specific(self, req_format, formats):
        """Return the entry whose 'format' equals req_format, else None."""
        for x in formats:
            if(x["format"]==req_format):
                return x
        return None

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('videoid')

        # pretend the age gate was already passed
        req = compat_urllib_request.Request(url)
        req.add_header('Cookie', 'age_verified=1')
        webpage = self._download_webpage(req, video_id)

        # Get JSON parameters
        json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters')
        try:
            params = json.loads(json_params)
        except ValueError:  # narrowed from a bare except
            raise ExtractorError(u'Invalid JSON')

        self.report_extraction(video_id)
        try:
            video_title = params['title']
            upload_date = unified_strdate(params['release_date_f'])
            video_description = params['description']
            video_uploader = params['submitted_by']
            thumbnail = params['thumbnails'][0]['image']
        except KeyError:
            # FIX: concatenating str + exception object raised TypeError;
            # convert the exception to text first.
            raise ExtractorError('Missing JSON parameter: ' + str(sys.exc_info()[1]))

        # Get all of the formats available
        DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
        download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
            webpage, u'download list').strip()

        # Get all of the links from the page
        LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
        links = re.findall(LINK_RE, download_list_html)
        if(len(links) == 0):
            raise ExtractorError(u'ERROR: no known formats available for video')

        self.to_screen(u'Links found: %d' % len(links))

        formats = []
        for link in links:

            # A link looks like this:
            # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
            # A path looks like this:
            # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
            video_url = unescapeHTML( link )
            path = compat_urllib_parse_urlparse( video_url ).path
            extension = os.path.splitext( path )[1][1:]
            format = path.split('/')[4].split('_')[:2]
            # resolution and bitrate come from the path component
            format = "-".join( format )
            # title = u'%s-%s-%s' % (video_title, size, bitrate)

            formats.append({
                'id': video_id,
                'url': video_url,
                'uploader': video_uploader,
                'upload_date': upload_date,
                'title': video_title,
                'ext': extension,
                'format': format,
                'thumbnail': thumbnail,
                'description': video_description
            })

        if self._downloader.params.get('listformats', None):
            self._print_formats(formats)
            return

        req_format = self._downloader.params.get('format', None)
        self.to_screen(u'Format: %s' % req_format)

        if req_format is None or req_format == 'best':
            return [formats[0]]
        elif req_format == 'worst':
            return [formats[-1]]
        elif req_format in ('-1', 'all'):
            return formats
        else:
            format = self._specific( req_format, formats )
            if format is None:
                raise ExtractorError(u'Requested format not available')
            return [format]
class PornotubeIE(InfoExtractor):
    """Information extractor for pornotube.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('videoid')
        video_title = mobj.group('title')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video URL (percent-encoded in the player config)
        VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
        video_url = self._search_regex(VIDEO_URL_RE, webpage, u'video url')
        video_url = compat_urllib_parse.unquote(video_url)

        #Get the uploaded date
        VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
        upload_date = self._html_search_regex(VIDEO_UPLOADED_RE, webpage, u'upload date', fatal=False)
        if upload_date: upload_date = unified_strdate(upload_date)

        info = {'id': video_id,
                'url': video_url,
                'uploader': None,
                'upload_date': upload_date,
                'title': video_title,
                'ext': 'flv',
                'format': 'flv'}

        return [info]
class YouJizzIE(InfoExtractor):
    """Information extractor for youjizz.com."""
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('videoid')

        # Get webpage content
        webpage = self._download_webpage(url, video_id)

        # Get the video title
        video_title = self._html_search_regex(r'<title>(?P<title>.*)</title>',
            webpage, u'title').strip()

        # Get the embed page; the real stream URL only lives there
        result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
        if result is None:
            raise ExtractorError(u'ERROR: unable to extract embed page')

        embed_page_url = result.group(0).strip()
        video_id = result.group('videoid')

        webpage = self._download_webpage(embed_page_url, video_id)

        # Get the video URL from the flash player setup call
        video_url = self._search_regex(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
            webpage, u'video URL')

        info = {'id': video_id,
                'url': video_url,
                'title': video_title,
                'ext': 'flv',
                'format': 'flv',
                'player_url': embed_page_url}

        return [info]
class EightTracksIE(InfoExtractor):
    """Information extractor for 8tracks.com mixes (playlists of songs)."""
    _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group('id')

        webpage = self._download_webpage(url, playlist_id)

        # Mix metadata is embedded as a javascript object literal
        json_like = self._search_regex(r"PAGE.mix = (.*?);\n", webpage, u'trax information', flags=re.DOTALL)
        data = json.loads(json_like)

        # A random session id is required by the play API
        session = str(random.randint(0, 1000000000))
        mix_id = data['id']
        track_count = data['tracks_count']
        first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
        next_url = first_url
        res = []
        # Walk the play/next API until the server flags the last track
        for i in itertools.count():
            api_json = self._download_webpage(next_url, playlist_id,
                note=u'Downloading song information %s/%s' % (str(i+1), track_count),
                errnote=u'Failed to download song information')
            api_data = json.loads(api_json)
            track_data = api_data[u'set']['track']
            info = {
                'id': track_data['id'],
                'url': track_data['track_file_stream_url'],
                'title': track_data['performer'] + u' - ' + track_data['name'],
                'raw_title': track_data['name'],
                'uploader_id': data['user']['login'],
                'ext': 'm4a',
            }
            res.append(info)
            if api_data['set']['at_last_track']:
                break
            next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
        return res
class KeekIE(InfoExtractor):
    """Information extractor for keek.com short videos."""
    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
    IE_NAME = u'keek'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')

        # Stream and thumbnail URLs are derived directly from the video id
        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
        webpage = self._download_webpage(url, video_id)

        video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
            webpage, u'title')

        uploader = self._html_search_regex(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
            webpage, u'uploader', fatal=False)

        info = {
                'id': video_id,
                'url': video_url,
                'ext': 'mp4',
                'title': video_title,
                'thumbnail': thumbnail,
                'uploader': uploader
        }
        return [info]
class TEDIE(InfoExtractor):
    """Information extractor for ted.com talks and playlists."""
    _VALID_URL=r'''http://www\.ted\.com/
                   (
                        ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
                        |
                        ((?P<type_talk>talks)) # We have a simple talk
                   )
                   (/lang/(.*?))? # The url may contain the language
                   /(?P<name>\w+) # Here goes the name and then ".html"
                   '''

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""
        # Overridden because _VALID_URL needs the VERBOSE flag
        return re.match(cls._VALID_URL, url, re.VERBOSE) is not None

    def _real_extract(self, url):
        m=re.match(self._VALID_URL, url, re.VERBOSE)
        if m.group('type_talk'):
            return [self._talk_info(url)]
        else:
            playlist_id=m.group('playlist_id')
            name=m.group('name')
            self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
            return [self._playlist_videos_info(url,name,playlist_id)]

    def _playlist_videos_info(self,url,name,playlist_id=0):
        '''Returns the videos of the playlist'''
        video_RE=r'''
                     <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
                     ([.\s]*?)data-playlist_item_id="(\d+)"
                     ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
                     '''
        video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
        webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
        m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
        m_names=re.finditer(video_name_RE,webpage)

        playlist_title = self._html_search_regex(r'div class="headline">\s*?<h1>\s*?<span>(.*?)</span>',
                                                 webpage, 'playlist title')

        # Each playlist entry is delegated back to this IE as a single talk
        playlist_entries = []
        for m_video, m_name in zip(m_videos,m_names):
            video_id=m_video.group('video_id')
            talk_url='http://www.ted.com%s' % m_name.group('talk_url')
            playlist_entries.append(self.url_result(talk_url, 'TED'))
        return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)

    def _talk_info(self, url, video_id=0):
        """Return the video for the talk in the url"""
        m = re.match(self._VALID_URL, url,re.VERBOSE)
        video_name = m.group('name')
        webpage = self._download_webpage(url, video_id, 'Downloading \"%s\" page' % video_name)
        self.report_extraction(video_name)
        # If the url includes the language we get the title translated
        title = self._html_search_regex(r'<span id="altHeadline" >(?P<title>.*)</span>',
                                        webpage, 'title')
        json_data = self._search_regex(r'<script.*?>var talkDetails = ({.*?})</script>',
                                    webpage, 'json data')
        info = json.loads(json_data)
        desc = self._html_search_regex(r'<div class="talk-intro">.*?<p.*?>(.*?)</p>',
                                       webpage, 'description', flags = re.DOTALL)

        thumbnail = self._search_regex(r'</span>[\s.]*</div>[\s.]*<img src="(.*?)"',
                                       webpage, 'thumbnail')
        # The last htmlStream entry is assumed to be the best quality
        return {
                'id': info['id'],
                'url': info['htmlStreams'][-1]['file'],
                'ext': 'mp4',
                'title': title,
                'thumbnail': thumbnail,
                'description': desc,
                }
class MySpassIE(InfoExtractor):
    """Information extractor for myspass.de, driven by its XML metadata API."""
    _VALID_URL = r'http://www.myspass.de/.*'

    def _real_extract(self, url):
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # video id is the last path element of the URL
        # usually there is a trailing slash, so also try the second but last
        url_path = compat_urllib_parse_urlparse(url).path
        url_parent_path, video_id = os.path.split(url_path)
        if not video_id:
            _, video_id = os.path.split(url_parent_path)

        # get metadata
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        # extract values from metadata; url and title are mandatory
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            raise ExtractorError(u'Unable to extract download url')
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]
        title_el = metadata.find('title')
        if title_el is None:
            raise ExtractorError(u'Unable to extract title')
        title = title_el.text
        # The remaining fields are optional and default to empty/None
        format_id_el = metadata.find('format_id')
        if format_id_el is None:
            format = ''
        else:
            format = format_id_el.text
        description_el = metadata.find('description')
        if description_el is not None:
            description = description_el.text
        else:
            description = None
        imagePreview_el = metadata.find('imagePreview')
        if imagePreview_el is not None:
            thumbnail = imagePreview_el.text
        else:
            thumbnail = None
        info = {
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': format,
            'thumbnail': thumbnail,
            'description': description
        }
        return info
class SpiegelIE(InfoExtractor):
    """Information extractor for spiegel.de videos."""
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        m = re.match(self._VALID_URL, url)
        video_id = m.group('videoID')

        webpage = self._download_webpage(url, video_id)

        video_title = self._html_search_regex(r'<div class="module-title">(.*?)</div>',
            webpage, u'title')

        # Per-video XML document lists the available encodings
        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(xml_url, video_id,
                    note=u'Downloading XML', errnote=u'Failed to download XML')

        idoc = xml.etree.ElementTree.fromstring(xml_code)
        # The last <type> element is assumed to be the best quality
        last_type = idoc[-1]
        filename = last_type.findall('./filename')[0].text
        duration = float(last_type.findall('./duration')[0].text)

        video_url = 'http://video2.spiegel.de/flash/' + filename
        video_ext = filename.rpartition('.')[2]
        info = {
            'id': video_id,
            'url': video_url,
            'ext': video_ext,
            'title': video_title,
            'duration': duration,
        }
        return [info]
class LiveLeakIE(InfoExtractor):
    """Information extractor for liveleak.com."""
    # Fixed scheme typo: was r'^(?:http?://)?', which never matched https URLs
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('video_id')

        webpage = self._download_webpage(url, video_id)

        video_url = self._search_regex(r'file: "(.*?)",',
            webpage, u'video URL')

        video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
            webpage, u'title').replace('LiveLeak.com -', '').strip()

        video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
            webpage, u'description', fatal=False)

        video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
            webpage, u'uploader', fatal=False)

        info = {
            'id':  video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'description': video_description,
            'uploader': video_uploader
        }

        return [info]
class TumblrIE(InfoExtractor):
    """Information extractor for videos hosted on tumblr blogs."""
    _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'

    def _real_extract(self, url):
        m_url = re.match(self._VALID_URL, url)
        video_id = m_url.group('id')
        blog = m_url.group('blog_name')

        # Normalize to the canonical post URL
        url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
        webpage = self._download_webpage(url, video_id)

        # The stream URL is embedded with \x22-escaped quotes
        re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
        video = re.search(re_video, webpage)
        if video is None:
            raise ExtractorError(u'Unable to extract video')
        video_url = video.group('video_url')
        ext = video.group('ext')

        video_thumbnail = self._search_regex(r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
            webpage, u'thumbnail', fatal=False)  # We pick the first poster
        if video_thumbnail: video_thumbnail = video_thumbnail.replace('\\', '')

        # The only place where you can get a title, it's not complete,
        # but searching in other places doesn't work for all videos
        video_title = self._html_search_regex(r'<title>(?P<title>.*?)</title>',
            webpage, u'title', flags=re.DOTALL)

        return [{'id': video_id,
                 'url': video_url,
                 'title': video_title,
                 'thumbnail': video_thumbnail,
                 'ext': ext
                 }]
class BandcampIE(InfoExtractor):
    """Information extractor for free bandcamp.com tracks."""
    _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        title = mobj.group('title')
        webpage = self._download_webpage(url, title)
        # We get the link to the free download page
        m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
        if m_download is None:
            raise ExtractorError(u'No free songs found')

        download_link = m_download.group(1)
        # Renamed from `id` to avoid shadowing the builtin
        track_id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
                             webpage, re.MULTILINE|re.DOTALL).group('id')

        download_webpage = self._download_webpage(download_link, track_id,
                                                  'Downloading free downloads page')
        # We get the dictionary of the track from some javascript code
        info = re.search(r'items: (.*?),$',
                         download_webpage, re.MULTILINE).group(1)
        info = json.loads(info)[0]
        # We pick mp3-320 for now, until format selection can be easily implemented.
        mp3_info = info[u'downloads'][u'mp3-320']
        # If we try to use this url it says the link has expired
        initial_url = mp3_info[u'url']
        re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
        m_url = re.match(re_url, initial_url)
        #We build the url we will use to get the final track url
        # This url is build in Bandcamp in the script download_bunde_*.js
        request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), track_id, m_url.group('ts'))
        final_url_webpage = self._download_webpage(request_url, track_id, 'Requesting download url')
        # If we could correctly generate the .rand field the url would be
        #in the "download_url" key
        final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)

        track_info = {'id': track_id,
                      'title': info[u'title'],
                      'ext': 'mp3',
                      'url': final_url,
                      'thumbnail': info[u'thumb_url'],
                      'uploader': info[u'artist']
                      }

        return [track_info]
class RedTubeIE(InfoExtractor):
    """Information Extractor for redtube"""
    _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'

    def _real_extract(self,url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        video_extension = 'mp4'
        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        video_url = self._html_search_regex(r'<source src="(.+?)" type="video/mp4">',
            webpage, u'video URL')

        video_title = self._html_search_regex('<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
            webpage, u'title')

        return [{
            'id':       video_id,
            'url':      video_url,
            'ext':      video_extension,
            'title':    video_title,
        }]
class InaIE(InfoExtractor):
    """Information Extractor for Ina.fr"""
    _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'

    def _real_extract(self,url):
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        # Metadata (stream URL + title) is served as an MRSS feed
        mrss_url='http://player.ina.fr/notices/%s.mrss' % video_id
        video_extension = 'mp4'
        webpage = self._download_webpage(mrss_url, video_id)

        self.report_extraction(video_id)

        video_url = self._html_search_regex(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
            webpage, u'video URL')

        video_title = self._search_regex(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
            webpage, u'title')

        return [{
            'id':       video_id,
            'url':      video_url,
            'ext':      video_extension,
            'title':    video_title,
        }]
class HowcastIE(InfoExtractor):
    """Information Extractor for Howcast.com"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        # Canonical page URL; the incoming url may carry extra path/query parts
        webpage_url = 'http://www.howcast.com/videos/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)

        video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
            webpage, u'video URL')

        video_title = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
            webpage, u'title')

        video_description = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
            webpage, u'description', fatal=False)

        thumbnail = self._html_search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id':       video_id,
            'url':      video_url,
            'ext':      'mp4',
            'title':    video_title,
            'description': video_description,
            'thumbnail': thumbnail,
        }]
class VineIE(InfoExtractor):
    """Information Extractor for Vine.co"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        webpage_url = 'https://vine.co/v/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)

        # Stream URL is exposed through the twitter player meta tag
        video_url = self._html_search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
            webpage, u'video URL')

        video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
            webpage, u'title')

        thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"',
            webpage, u'thumbnail', fatal=False)

        uploader = self._html_search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
            webpage, u'uploader', fatal=False, flags=re.DOTALL)

        return [{
            'id':        video_id,
            'url':       video_url,
            'ext':       'mp4',
            'title':     video_title,
            'thumbnail': thumbnail,
            'uploader':  uploader,
        }]
class FlickrIE(InfoExtractor):
    """Information Extractor for Flickr videos"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        video_uploader_id = mobj.group('uploader_id')
        webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        # The per-photo secret is required by both video API endpoints
        secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')

        first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
        first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')

        node_id = self._html_search_regex(r'<Item id="id">(\d+-\d+)</Item>',
            first_xml, u'node_id')

        second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
        second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')

        self.report_extraction(video_id)

        # Final stream URL = APP attribute + HTML-unescaped FULLPATH
        mobj = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video url')
        video_url = mobj.group(1) + unescapeHTML(mobj.group(2))

        video_title = self._html_search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'video title')

        video_description = self._html_search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'description', fatal=False)

        thumbnail = self._html_search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id':          video_id,
            'url':         video_url,
            'ext':         'mp4',
            'title':       video_title,
            'description': video_description,
            'thumbnail':   thumbnail,
            'uploader_id': video_uploader_id,
        }]
class TeamcocoIE(InfoExtractor):
    """Information extractor for teamcoco.com videos."""
    _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        url_title = mobj.group('url_title')
        webpage = self._download_webpage(url, url_title)

        # The numeric id is only available inside the page markup
        video_id = self._html_search_regex(r'<article class="video" data-id="(\d+?)"',
            webpage, u'video id')

        self.report_extraction(video_id)

        video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
            webpage, u'title')

        thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)"',
            webpage, u'thumbnail', fatal=False)

        video_description = self._html_search_regex(r'<meta property="og:description" content="(.*?)"',
            webpage, u'description', fatal=False)

        # Stream URLs live in a separate XML document
        data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
        data = self._download_webpage(data_url, video_id, 'Downloading data webpage')

        video_url = self._html_search_regex(r'<file type="high".*?>(.*?)</file>',
            data, u'video URL')

        return [{
            'id':          video_id,
            'url':         video_url,
            'ext':         'mp4',
            'title':       video_title,
            'thumbnail':   thumbnail,
            'description': video_description,
        }]
class XHamsterIE(InfoExtractor):
    """Information Extractor for xHamster"""
    _VALID_URL = r'(?:http://)?(?:www.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'

    def _real_extract(self,url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
        webpage = self._download_webpage(mrss_url, video_id)

        mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract media URL')
        # Empty server means 'file' already holds a full (quoted) URL
        if len(mobj.group('server')) == 0:
            video_url = compat_urllib_parse.unquote(mobj.group('file'))
        else:
            video_url = mobj.group('server')+'/key='+mobj.group('file')
        video_extension = video_url.split('.')[-1]

        video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
            webpage, u'title')

        # Can't see the description anywhere in the UI
        # video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
        #     webpage, u'description', fatal=False)
        # if video_description: video_description = unescapeHTML(video_description)

        mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
        if mobj:
            video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
        else:
            video_upload_date = None
            self._downloader.report_warning(u'Unable to extract upload date')

        video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
            webpage, u'uploader id', default=u'anonymous')

        video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id':       video_id,
            'url':      video_url,
            'ext':      video_extension,
            'title':    video_title,
            # 'description': video_description,
            'upload_date': video_upload_date,
            'uploader_id': video_uploader_id,
            'thumbnail': video_thumbnail
        }]
class HypemIE(InfoExtractor):
    """Information Extractor for hypem"""
    _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        track_id = mobj.group(1)

        # A timestamped request is required; the response cookie is
        # needed later for the serve/source call
        data = { 'ax': 1, 'ts': time.time() }
        data_encoded = compat_urllib_parse.urlencode(data)
        complete_url = url + "?" + data_encoded
        request = compat_urllib_request.Request(complete_url)
        response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
        cookie = urlh.headers.get('Set-Cookie', '')

        self.report_extraction(track_id)

        html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
            response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
        try:
            track_list = json.loads(html_tracks)
            track = track_list[u'tracks'][0]
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')

        key = track[u"key"]
        track_id = track[u"id"]
        artist = track[u"artist"]
        title = track[u"song"]

        serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
        request = compat_urllib_request.Request(serve_url, "" , {'Content-Type': 'application/json'})
        request.add_header('cookie', cookie)
        song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
        try:
            song_data = json.loads(song_data_json)
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')
        final_url = song_data[u"url"]

        return [{
            'id':       track_id,
            'url':      final_url,
            'ext':      "mp3",
            'title':    title,
            'artist':   artist,
        }]
class Vbox7IE(InfoExtractor):
    """Information Extractor for Vbox7"""
    _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'

    def _real_extract(self,url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        # The play page only contains a javascript redirect to the real page
        redirect_page, urlh = self._download_webpage_handle(url, video_id)
        new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
        redirect_url = urlh.geturl() + new_location
        webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')

        title = self._html_search_regex(r'<title>(.*)</title>',
            webpage, u'title').split('/')[0].strip()

        # The media URL comes from a POST to the flash info endpoint
        info_url = "http://vbox7.com/play/magare.do"
        data = compat_urllib_parse.urlencode({'as3':'1','vid':video_id})
        info_request = compat_urllib_request.Request(info_url, data)
        info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
        info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
        if info_response is None:
            raise ExtractorError(u'Unable to extract the media url')
        # Response shape: "url=...&thumb=..." — take the value of each pair
        (final_url, thumbnail_url) = map(lambda x: x.split('=')[1], info_response.split('&'))

        return [{
            'id':        video_id,
            'url':       final_url,
            'ext':       'flv',
            'title':     title,
            'thumbnail': thumbnail_url,
        }]
class GametrailersIE(InfoExtractor):
    """Information extractor for gametrailers.com (MTV-backed feeds)."""
    _VALID_URL = r'http://www.gametrailers.com/(?P<type>videos|reviews|full-episodes)/(?P<id>.*?)/(?P<title>.*)'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('id')
        video_type = mobj.group('type')
        webpage = self._download_webpage(url, video_id)
        # full-episodes pages embed the mgid in a different attribute
        if video_type == 'full-episodes':
            mgid_re = r'data-video="(?P<mgid>mgid:.*?)"'
        else:
            mgid_re = r'data-contentId=\'(?P<mgid>mgid:.*?)\''
        mgid = self._search_regex(mgid_re, webpage, u'mgid')
        data = compat_urllib_parse.urlencode({'uri': mgid, 'acceptMethods': 'fms'})

        info_page = self._download_webpage('http://www.gametrailers.com/feeds/mrss?' + data,
                                           video_id, u'Downloading video info')
        links_webpage = self._download_webpage('http://www.gametrailers.com/feeds/mediagen/?' + data,
                                               video_id, u'Downloading video urls info')

        self.report_extraction(video_id)
        info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]></title>.*
                      <description><!\[CDATA\[(?P<description>.*?)\]\]></description>.*
                      <image>.*
                        <url>(?P<thumb>.*?)</url>.*
                      </image>'''

        m_info = re.search(info_re, info_page, re.VERBOSE|re.DOTALL)
        if m_info is None:
            raise ExtractorError(u'Unable to extract video info')
        video_title = m_info.group('title')
        video_description = m_info.group('description')
        video_thumb = m_info.group('thumb')

        m_urls = list(re.finditer(r'<src>(?P<url>.*)</src>', links_webpage))
        # Fixed: was `raise ExtractError(u'Unable to extrat video url')` —
        # a NameError (no such class) plus a message typo; also the list
        # returned by list() is never None, so test emptiness directly.
        if not m_urls:
            raise ExtractorError(u'Unable to extract video url')
        # They are sorted from worst to best quality
        video_url = m_urls[-1].group('url')

        return {'url':         video_url,
                'id':          video_id,
                'title':       video_title,
                # Videos are actually flv not mp4
                'ext':         'flv',
                'thumbnail':   video_thumb,
                'description': video_description,
                }
# NOTE(review): the registry list in this function is shown only in part here;
# most entries are elided from this view. Ordering is significant — the first
# extractor whose suitable() matches handles the URL — so entries must not be
# reordered, and the generic fallback presumably stays last (TODO confirm).
def gen_extractors():
    """ Return a list of an instance of every supported extractor.
    The order does matter; the first extractor matched is the one handling the URL.
        YoutubePlaylistIE(),
        StanfordOpenClassroomIE(),
        WorldStarHipHopIE(),
def get_info_extractor(ie_name):
    """Returns the info extractor class with the given ie_name"""
    # Extractor classes follow the <Name>IE naming convention and live at
    # module scope, so a plain globals() lookup resolves them.
    class_name = '%sIE' % ie_name
    return globals()[class_name]