10 import xml.etree.ElementTree
19 from .extractor.common import InfoExtractor, SearchInfoExtractor
21 from .extractor.ard import ARDIE
22 from .extractor.arte import ArteTvIE
23 from .extractor.bliptv import BlipTVIE, BlipTVUserIE
24 from .extractor.comedycentral import ComedyCentralIE
25 from .extractor.collegehumor import CollegeHumorIE
26 from .extractor.dailymotion import DailymotionIE
27 from .extractor.depositfiles import DepositFilesIE
28 from .extractor.escapist import EscapistIE
29 from .extractor.facebook import FacebookIE
30 from .extractor.gametrailers import GametrailersIE
31 from .extractor.generic import GenericIE
32 from .extractor.googleplus import GooglePlusIE
33 from .extractor.googlesearch import GoogleSearchIE
34 from .extractor.infoq import InfoQIE
35 from .extractor.metacafe import MetacafeIE
36 from .extractor.mtv import MTVIE
37 from .extractor.myvideo import MyVideoIE
38 from .extractor.nba import NBAIE
39 from .extractor.statigram import StatigramIE
40 from .extractor.photobucket import PhotobucketIE
41 from .extractor.soundcloud import SoundcloudIE, SoundcloudSetIE
42 from .extractor.stanfordoc import StanfordOpenClassroomIE
43 from .extractor.vimeo import VimeoIE
44 from .extractor.xvideos import XVideosIE
45 from .extractor.yahoo import YahooIE, YahooSearchIE
46 from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, YoutubeUserIE, YoutubeChannelIE
47 from .extractor.zdf import ZDFIE
# Extractor for www.mixcloud.com. Disabled via _WORKING = False pending a port
# to Mixcloud's newer API (see inline comment below).
# NOTE(review): this chunk is garbled — the original file's line numbers are
# fused into each line and many intermediate lines (try statements, returns)
# are missing; code is kept verbatim.
51 class MixcloudIE(InfoExtractor):
52 """Information extractor for www.mixcloud.com"""
54 _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
55 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
# Announce the JSON metadata download on screen.
58 def report_download_json(self, file_id):
59 """Report JSON download."""
60 self.to_screen(u'Downloading json')
# Select the URL list for a format: 'best' (or an unknown bitrate) picks the
# highest available bitrate; the TypeError branch handles formats that carry
# no bitrate map at all.
62 def get_urls(self, jsonData, fmt, bitrate='best'):
63 """Get urls from 'audio_formats' section in json"""
66 bitrate_list = jsonData[fmt]
67 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
68 bitrate = max(bitrate_list) # select highest
70 url_list = jsonData[fmt][bitrate]
71 except TypeError: # we have no bitrate info.
72 url_list = jsonData[fmt]
# Probe candidate URLs with urlopen; returns the first that responds.
75 def check_urls(self, url_list):
76 """Returns 1st active url from list"""
79 compat_urllib_request.urlopen(url)
81 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
# Print a human-readable table of format / bitrate / extension.
86 def _print_formats(self, formats):
87 print('Available formats:')
88 for fmt in formats.keys():
89 for b in formats[fmt]:
91 ext = formats[fmt][b][0]
92 print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
93 except TypeError: # we have no bitrate info
95 print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
# Main entry: build the api/1/cloudcast JSON URL from the page URL, honor the
# downloader's 'format'/'listformats' options, and return the file info dict.
# NOTE(review): the .decode('utf-8') calls on regex-group results are
# Python 2-era; on Python 3 str has no .decode — confirm target interpreter.
98 def _real_extract(self, url):
99 mobj = re.match(self._VALID_URL, url)
101 raise ExtractorError(u'Invalid URL: %s' % url)
102 # extract uploader & filename from url
103 uploader = mobj.group(1).decode('utf-8')
104 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
106 # construct API request
107 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
108 # retrieve .json file with links to files
109 request = compat_urllib_request.Request(file_url)
111 self.report_download_json(file_url)
112 jsonData = compat_urllib_request.urlopen(request).read()
113 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
114 raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))
117 json_data = json.loads(jsonData)
118 player_url = json_data['player_swf_url']
119 formats = dict(json_data['audio_formats'])
121 req_format = self._downloader.params.get('format', None)
124 if self._downloader.params.get('listformats', None):
125 self._print_formats(formats)
# 'best'/unspecified: walk all formats until check_urls yields a live URL.
128 if req_format is None or req_format == 'best':
129 for format_param in formats.keys():
130 url_list = self.get_urls(formats, format_param)
132 file_url = self.check_urls(url_list)
133 if file_url is not None:
136 if req_format not in formats:
137 raise ExtractorError(u'Format is not available')
139 url_list = self.get_urls(formats, req_format)
140 file_url = self.check_urls(url_list)
141 format_param = req_format
144 'id': file_id.decode('utf-8'),
145 'url': file_url.decode('utf-8'),
146 'uploader': uploader.decode('utf-8'),
148 'title': json_data['name'],
149 'ext': file_url.split('.')[-1].decode('utf-8'),
150 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
151 'thumbnail': json_data['thumbnail_url'],
152 'description': json_data['description'],
153 'player_url': player_url.decode('utf-8'),
# Extractor for v.youku.com: generates a pseudo-random session id and
# descrambles the segment file ids before building per-segment flv URLs.
# NOTE(review): original line numbers are fused into each line and several
# lines are missing from this chunk; code kept verbatim.
159 class YoukuIE(InfoExtractor):
160 _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
# Session id: millisecond timestamp plus two random components (used by
# _real_extract via self._gen_sid(); the def line is in an elided line).
163 nowTime = int(time.time() * 1000)
164 random1 = random.randint(1000,1998)
165 random2 = random.randint(1000,9999)
167 return "%d%d%d" %(nowTime,random1,random2)
# Build the seeded shuffle of the character alphabet used to decode scrambled
# file ids (linear-congruential-style update on 'seed' each step).
169 def _get_file_ID_mix_string(self, seed):
171 source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
173 for i in range(len(source)):
174 seed = (seed * 211 + 30031 ) % 65536
175 index = math.floor(seed / 65536 * len(source) )
176 mixed.append(source[int(index)])
177 source.remove(source[int(index)])
178 #return ''.join(mixed)
# Decode a '*'-separated scrambled file id via the seeded mix string.
181 def _get_file_id(self, fileId, seed):
182 mixed = self._get_file_ID_mix_string(seed)
183 ids = fileId.split('*')
187 realId.append(mixed[int(ch)])
188 return ''.join(realId)
# Main entry: fetch getPlayList JSON, pick a format ('best'/'worst'/explicit),
# then emit one info dict per segment with a getFlvPath download URL.
190 def _real_extract(self, url):
191 mobj = re.match(self._VALID_URL, url)
193 raise ExtractorError(u'Invalid URL: %s' % url)
194 video_id = mobj.group('ID')
196 info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
198 jsondata = self._download_webpage(info_url, video_id)
200 self.report_extraction(video_id)
202 config = json.loads(jsondata)
204 video_title = config['data'][0]['title']
205 seed = config['data'][0]['seed']
207 format = self._downloader.params.get('format', None)
208 supported_format = list(config['data'][0]['streamfileids'].keys())
210 if format is None or format == 'best':
211 if 'hd2' in supported_format:
216 elif format == 'worst':
224 fileid = config['data'][0]['streamfileids'][format]
225 keys = [s['k'] for s in config['data'][0]['segs'][format]]
226 except (UnicodeDecodeError, ValueError, KeyError):
227 raise ExtractorError(u'Unable to extract info section')
230 sid = self._gen_sid()
231 fileid = self._get_file_id(fileid, seed)
233 #column 8,9 of fileid represent the segment number
234 #fileid[7:9] should be changed
# Per segment: splice the hex segment index into the file id and request a key.
235 for index, key in enumerate(keys):
237 temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
238 download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
241 'id': '%s_part%02d' % (video_id, index),
245 'title': video_title,
248 files_info.append(info)
# Extractor for video.xnxx.com: flv URL, title and thumbnail are pulled out
# of the page with the three regexes below.
# NOTE(review): original line numbers are fused into each line and some lines
# are missing from this chunk; code kept verbatim.
253 class XNXXIE(InfoExtractor):
254 """Information extractor for xnxx.com"""
256 _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
258 VIDEO_URL_RE = r'flv_url=(.*?)&'
259 VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
260 VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'
262 def _real_extract(self, url):
263 mobj = re.match(self._VALID_URL, url)
265 raise ExtractorError(u'Invalid URL: %s' % url)
266 video_id = mobj.group(1)
268 # Get webpage content
269 webpage = self._download_webpage(url, video_id)
# The flv URL is percent-encoded in the page; unquote after extraction.
271 video_url = self._search_regex(self.VIDEO_URL_RE,
272 webpage, u'video URL')
273 video_url = compat_urllib_parse.unquote(video_url)
275 video_title = self._html_search_regex(self.VIDEO_TITLE_RE,
# Thumbnail is optional (fatal=False) — extraction proceeds without it.
278 video_thumbnail = self._search_regex(self.VIDEO_THUMB_RE,
279 webpage, u'thumbnail', fatal=False)
286 'title': video_title,
288 'thumbnail': video_thumbnail,
# Extractor for justin.tv / twitch.tv: channels (paged archive listing),
# archived broadcasts (/b/), and chapters (/c/ — resolved via the by_chapter
# XML API plus the kraken chapter-metadata JSON).
# NOTE(review): original line numbers are fused into each line and many lines
# are missing from this chunk; code kept verbatim.
295 class JustinTVIE(InfoExtractor):
296 """Information extractor for justin.tv and twitch.tv"""
297 # TODO: One broadcast may be split into multiple videos. The key
298 # 'broadcast_id' is the same for all parts, and 'broadcast_part'
299 # starts at 1 and increases. Can we treat all parts as one video?
301 _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
303 (?P<channelid>[^/]+)|
304 (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
305 (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
# Page size for the channel-archives API.
309 _JUSTIN_PAGE_LIMIT = 100
310 IE_NAME = u'justin.tv'
312 def report_download_page(self, channel, offset):
313 """Report attempt to download a single page of videos."""
314 self.to_screen(u'%s: Downloading video information from %d to %d' %
315 (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
317 # Return count of items, list of *valid* items
318 def _parse_page(self, url, video_id):
319 webpage = self._download_webpage(url, video_id,
320 u'Downloading video info JSON',
321 u'unable to download video info JSON')
323 response = json.loads(webpage)
# A non-list response is an API error object; surface its 'error' field.
324 if type(response) != list:
325 error_text = response.get('error', 'unknown error')
326 raise ExtractorError(u'Justin.tv API: %s' % error_text)
328 for clip in response:
329 video_url = clip['video_file_url']
331 video_extension = os.path.splitext(video_url)[1][1:]
# start_time 'YYYY-MM-DD...' -> 'YYYYMMDD'.
332 video_date = re.sub('-', '', clip['start_time'][:10])
333 video_uploader_id = clip.get('user_id', clip.get('channel_id'))
334 video_id = clip['id']
335 video_title = clip.get('title', video_id)
339 'title': video_title,
340 'uploader': clip.get('channel_name', video_uploader_id),
341 'uploader_id': video_uploader_id,
342 'upload_date': video_date,
343 'ext': video_extension,
345 return (len(response), info)
347 def _real_extract(self, url):
348 mobj = re.match(self._VALID_URL, url)
350 raise ExtractorError(u'invalid URL: %s' % url)
352 api_base = 'http://api.justin.tv'
# Channel URL: paged listing of the channel's archives (loop at the bottom).
354 if mobj.group('channelid'):
356 video_id = mobj.group('channelid')
357 api = api_base + '/channel/archives/%s.json' % video_id
# Chapter URL: find the archive id on the page, locate the matching archive
# in the by_chapter XML, then enrich with kraken chapter metadata.
358 elif mobj.group('chapterid'):
359 chapter_id = mobj.group('chapterid')
361 webpage = self._download_webpage(url, chapter_id)
362 m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
364 raise ExtractorError(u'Cannot find archive of a chapter')
365 archive_id = m.group(1)
367 api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
368 chapter_info_xml = self._download_webpage(api, chapter_id,
369 note=u'Downloading chapter information',
370 errnote=u'Chapter information download failed')
371 doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
372 for a in doc.findall('.//archive'):
373 if archive_id == a.find('./id').text:
376 raise ExtractorError(u'Could not find chapter in chapter information')
378 video_url = a.find('./video_file_url').text
379 video_ext = video_url.rpartition('.')[2] or u'flv'
381 chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
382 chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
383 note='Downloading chapter metadata',
384 errnote='Download of chapter metadata failed')
385 chapter_info = json.loads(chapter_info_json)
387 bracket_start = int(doc.find('.//bracket_start').text)
388 bracket_end = int(doc.find('.//bracket_end').text)
390 # TODO determine start (and probably fix up file)
391 # youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
392 #video_url += u'?start=' + TODO:start_timestamp
393 # bracket_start is 13290, but we want 51670615
394 self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
395 u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))
398 'id': u'c' + chapter_id,
401 'title': chapter_info['title'],
402 'thumbnail': chapter_info['preview'],
403 'description': chapter_info['description'],
404 'uploader': chapter_info['channel']['display_name'],
405 'uploader_id': chapter_info['channel']['name'],
# Broadcast URL (/b/): single archive lookup.
409 video_id = mobj.group('videoid')
410 api = api_base + '/broadcast/by_archive/%s.json' % video_id
412 self.report_extraction(video_id)
416 limit = self._JUSTIN_PAGE_LIMIT
# Page until a short page (page_count != limit) or paging is disabled.
419 self.report_download_page(video_id, offset)
420 page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
421 page_count, page_info = self._parse_page(page_url, video_id)
422 info.extend(page_info)
423 if not paged or page_count != limit:
# Extractor for funnyordie.com: video URL from the <video>/<source> markup,
# title from the player <h1> (falling back to <title>), description from
# the og:description meta tag.
# NOTE(review): original line numbers are fused into each line and some lines
# are missing from this chunk; code kept verbatim.
428 class FunnyOrDieIE(InfoExtractor):
429 _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
431 def _real_extract(self, url):
432 mobj = re.match(self._VALID_URL, url)
434 raise ExtractorError(u'invalid URL: %s' % url)
436 video_id = mobj.group('id')
437 webpage = self._download_webpage(url, video_id)
439 video_url = self._html_search_regex(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"',
440 webpage, u'video URL', flags=re.DOTALL)
# Two title patterns are tried in order (tuple argument).
442 title = self._html_search_regex((r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>",
443 r'<title>(?P<title>[^<]+?)</title>'), webpage, 'title', flags=re.DOTALL)
445 video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
446 webpage, u'description', fatal=False, flags=re.DOTALL)
453 'description': video_description,
# Extractor for store.steampowered.com: handles the age-gate redirect and
# returns a playlist of every trailer found on the video page.
# NOTE(review): original line numbers are fused into each line and some lines
# are missing from this chunk (including the (?P<gameID>...) group of
# _VALID_URL referenced below); code kept verbatim.
457 class SteamIE(InfoExtractor):
458 _VALID_URL = r"""http://store\.steampowered\.com/
460 (?P<urltype>video|app)/ #If the page is only for videos or for a game
462 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
464 _VIDEO_PAGE_TEMPLATE = 'http://store.steampowered.com/video/%s/'
465 _AGECHECK_TEMPLATE = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970'
# _VALID_URL is a verbose pattern, so matching needs re.VERBOSE here.
468 def suitable(cls, url):
469 """Receives a URL and returns True if suitable for this IE."""
470 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
472 def _real_extract(self, url):
473 m = re.match(self._VALID_URL, url, re.VERBOSE)
474 gameID = m.group('gameID')
476 videourl = self._VIDEO_PAGE_TEMPLATE % gameID
477 webpage = self._download_webpage(videourl, gameID)
# Re-fetch through a pre-filled agecheck URL when the birth-date gate shows.
479 if re.search('<h2>Please enter your birth date to continue:</h2>', webpage) is not None:
480 videourl = self._AGECHECK_TEMPLATE % gameID
481 self.report_age_confirmation()
482 webpage = self._download_webpage(videourl, gameID)
484 self.report_extraction(gameID)
485 game_title = self._html_search_regex(r'<h2 class="pageheader">(.*?)</h2>',
486 webpage, 'game title')
488 urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
489 mweb = re.finditer(urlRE, webpage)
490 namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
491 titles = re.finditer(namesRE, webpage)
492 thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
493 thumbs = re.finditer(thumbsRE, webpage)
# Movie entries, titles and thumbnails are correlated positionally via zip.
495 for vid,vtitle,thumb in zip(mweb,titles,thumbs):
496 video_id = vid.group('videoID')
497 title = vtitle.group('videoName')
498 video_url = vid.group('videoURL')
499 video_thumb = thumb.group('thumbnail')
501 raise ExtractorError(u'Cannot find video url for %s' % video_id)
506 'title': unescapeHTML(title),
507 'thumbnail': video_thumb
510 return [self.playlist_result(videos, gameID, game_title)]
# Extractor for www.ustream.tv recorded videos: the media URL is derived
# directly from the video id; title/uploader/thumbnail scraped from the page.
# NOTE(review): original line numbers are fused into each line and some lines
# are missing from this chunk; code kept verbatim.
512 class UstreamIE(InfoExtractor):
513 _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
516 def _real_extract(self, url):
517 m = re.match(self._VALID_URL, url)
518 video_id = m.group('videoID')
520 video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
521 webpage = self._download_webpage(url, video_id)
523 self.report_extraction(video_id)
525 video_title = self._html_search_regex(r'data-title="(?P<title>.+)"',
# Uploader and thumbnail are optional (fatal=False).
528 uploader = self._html_search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
529 webpage, u'uploader', fatal=False, flags=re.DOTALL)
531 thumbnail = self._html_search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"',
532 webpage, u'thumbnail', fatal=False)
538 'title': video_title,
539 'uploader': uploader,
540 'thumbnail': thumbnail,
# Extractor for worldstarhiphop.com / worldstarcandy.com video pages.
# NOTE(review): original line numbers are fused into each line and some lines
# are missing from this chunk; code kept verbatim.
544 class WorldStarHipHopIE(InfoExtractor):
545 _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
546 IE_NAME = u'WorldStarHipHop'
548 def _real_extract(self, url):
549 m = re.match(self._VALID_URL, url)
550 video_id = m.group('id')
552 webpage_src = self._download_webpage(url, video_id)
# Media URL comes from the flash player's addVariable("file", ...) call.
554 video_url = self._search_regex(r'so\.addVariable\("file","(.*?)"\)',
555 webpage_src, u'video URL')
# Branch on container type; the branch bodies are in elided lines.
557 if 'mp4' in video_url:
562 video_title = self._html_search_regex(r"<title>(.*)</title>",
563 webpage_src, u'title')
565 # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
566 thumbnail = self._html_search_regex(r'rel="image_src" href="(.*)" />',
567 webpage_src, u'thumbnail', fatal=False)
# Candy pages carry the real title in a candytitles span; prefer it if found.
570 _title = r"""candytitles.*>(.*)</span>"""
571 mobj = re.search(_title, webpage_src)
573 video_title = mobj.group(1)
578 'title' : video_title,
579 'thumbnail' : thumbnail,
# Extractor for rbmaradio.com shows: show metadata is embedded in the page as
# JSON (window.gon.show), with the audio URL in 'akamai_url'.
# NOTE(review): original line numbers are fused into each line and some lines
# are missing from this chunk; code kept verbatim.
584 class RBMARadioIE(InfoExtractor):
585 _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'
587 def _real_extract(self, url):
588 m = re.match(self._VALID_URL, url)
589 video_id = m.group('videoID')
591 webpage = self._download_webpage(url, video_id)
593 json_data = self._search_regex(r'window\.gon.*?gon\.show=(.+?);$',
594 webpage, u'json data', flags=re.MULTILINE)
597 data = json.loads(json_data)
598 except ValueError as e:
599 raise ExtractorError(u'Invalid JSON: ' + str(e))
# Force the 256 kbps variant; extension is taken from the URL path.
601 video_url = data['akamai_url'] + '&cbr=256'
602 url_parts = compat_urllib_parse_urlparse(video_url)
603 video_ext = url_parts.path.rpartition('.')[2]
608 'title': data['title'],
609 'description': data.get('teaser_text'),
610 'location': data.get('country_of_origin'),
611 'uploader': data.get('host', {}).get('name'),
612 'uploader_id': data.get('host', {}).get('slug'),
613 'thumbnail': data.get('image', {}).get('large_url_2x'),
614 'duration': data.get('duration'),
# Extractor for youporn.com: reads the page's `currentVideo` JSON blob and the
# downloadList links, then applies the user's format selection.
# NOTE(review): original line numbers are fused into each line and many lines
# are missing from this chunk; code kept verbatim.
619 class YouPornIE(InfoExtractor):
620 """Information extractor for youporn.com."""
621 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
623 def _print_formats(self, formats):
624 """Print all available formats"""
625 print(u'Available formats:')
626 print(u'ext\t\tformat')
627 print(u'---------------------------------')
628 for format in formats:
629 print(u'%s\t\t%s' % (format['ext'], format['format']))
# Return the entry matching the exact requested format string (body elided).
631 def _specific(self, req_format, formats):
633 if(x["format"]==req_format):
637 def _real_extract(self, url):
638 mobj = re.match(self._VALID_URL, url)
640 raise ExtractorError(u'Invalid URL: %s' % url)
641 video_id = mobj.group('videoid')
# Bypass the age gate with a pre-set cookie.
643 req = compat_urllib_request.Request(url)
644 req.add_header('Cookie', 'age_verified=1')
645 webpage = self._download_webpage(req, video_id)
647 # Get JSON parameters
648 json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters')
650 params = json.loads(json_params)
652 raise ExtractorError(u'Invalid JSON')
654 self.report_extraction(video_id)
656 video_title = params['title']
657 upload_date = unified_strdate(params['release_date_f'])
658 video_description = params['description']
659 video_uploader = params['submitted_by']
660 thumbnail = params['thumbnails'][0]['image']
# NOTE(review): sys.exc_info()[1] is an exception object — concatenating it
# to a str would itself raise TypeError; should be wrapped in compat_str().
662 raise ExtractorError('Missing JSON parameter: ' + sys.exc_info()[1])
664 # Get all of the formats available
665 DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
666 download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
667 webpage, u'download list').strip()
669 # Get all of the links from the page
670 LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
671 links = re.findall(LINK_RE, download_list_html)
673 raise ExtractorError(u'ERROR: no known formats available for video')
675 self.to_screen(u'Links found: %d' % len(links))
680 # A link looks like this:
681 # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
682 # A path looks like this:
683 # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
684 video_url = unescapeHTML( link )
685 path = compat_urllib_parse_urlparse( video_url ).path
686 extension = os.path.splitext( path )[1][1:]
# Format label (e.g. '480p-370k') is derived from a path segment.
687 format = path.split('/')[4].split('_')[:2]
690 format = "-".join( format )
691 # title = u'%s-%s-%s' % (video_title, size, bitrate)
696 'uploader': video_uploader,
697 'upload_date': upload_date,
698 'title': video_title,
701 'thumbnail': thumbnail,
702 'description': video_description
705 if self._downloader.params.get('listformats', None):
706 self._print_formats(formats)
709 req_format = self._downloader.params.get('format', None)
710 self.to_screen(u'Format: %s' % req_format)
# Format selection: best/worst/all/explicit; bodies of these branches elided.
712 if req_format is None or req_format == 'best':
714 elif req_format == 'worst':
716 elif req_format in ('-1', 'all'):
719 format = self._specific( req_format, formats )
721 raise ExtractorError(u'Requested format not available')
# Extractor for pornotube.com: flv URL and upload date scraped from the page;
# the title comes straight from the URL.
# NOTE(review): original line numbers are fused into each line and some lines
# are missing from this chunk; code kept verbatim.
726 class PornotubeIE(InfoExtractor):
727 """Information extractor for pornotube.com."""
728 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
730 def _real_extract(self, url):
731 mobj = re.match(self._VALID_URL, url)
733 raise ExtractorError(u'Invalid URL: %s' % url)
735 video_id = mobj.group('videoid')
736 video_title = mobj.group('title')
738 # Get webpage content
739 webpage = self._download_webpage(url, video_id)
742 VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
743 video_url = self._search_regex(VIDEO_URL_RE, webpage, u'video url')
744 video_url = compat_urllib_parse.unquote(video_url)
746 #Get the uploaded date
747 VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
# Date is optional (fatal=False); normalized to YYYYMMDD when present.
748 upload_date = self._html_search_regex(VIDEO_UPLOADED_RE, webpage, u'upload date', fatal=False)
749 if upload_date: upload_date = unified_strdate(upload_date)
751 info = {'id': video_id,
754 'upload_date': upload_date,
755 'title': video_title,
# Extractor for youjizz.com: two-step fetch — the watch page yields an embed
# page URL, and the embed page yields the actual media URL.
# NOTE(review): original line numbers are fused into each line and some lines
# are missing from this chunk; code kept verbatim.
761 class YouJizzIE(InfoExtractor):
762 """Information extractor for youjizz.com."""
763 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'
765 def _real_extract(self, url):
766 mobj = re.match(self._VALID_URL, url)
768 raise ExtractorError(u'Invalid URL: %s' % url)
770 video_id = mobj.group('videoid')
772 # Get webpage content
773 webpage = self._download_webpage(url, video_id)
775 # Get the video title
776 video_title = self._html_search_regex(r'<title>(?P<title>.*)</title>',
777 webpage, u'title').strip()
# Locate the numeric embed-page URL; video_id is re-read from it.
780 result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
782 raise ExtractorError(u'ERROR: unable to extract embed page')
784 embed_page_url = result.group(0).strip()
785 video_id = result.group('videoid')
787 webpage = self._download_webpage(embed_page_url, video_id)
# The embed page hands the media URL to the flash player via addVariable.
790 video_url = self._search_regex(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
791 webpage, u'video URL')
793 info = {'id': video_id,
795 'title': video_title,
798 'player_url': embed_page_url}
# Extractor for 8tracks.com mixes: starts a play session and then walks the
# 'next' API endpoint, collecting one track per iteration until at_last_track.
# NOTE(review): original line numbers are fused into each line and some lines
# are missing from this chunk (including the assignment of mix_id used
# below); code kept verbatim.
802 class EightTracksIE(InfoExtractor):
804 _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'
806 def _real_extract(self, url):
807 mobj = re.match(self._VALID_URL, url)
809 raise ExtractorError(u'Invalid URL: %s' % url)
810 playlist_id = mobj.group('id')
812 webpage = self._download_webpage(url, playlist_id)
# Mix metadata is embedded in the page as `PAGE.mix = {...};`.
814 json_like = self._search_regex(r"PAGE.mix = (.*?);\n", webpage, u'trax information', flags=re.DOTALL)
815 data = json.loads(json_like)
# Random session token for the play API.
817 session = str(random.randint(0, 1000000000))
819 track_count = data['tracks_count']
820 first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
823 for i in itertools.count():
824 api_json = self._download_webpage(next_url, playlist_id,
825 note=u'Downloading song information %s/%s' % (str(i+1), track_count),
826 errnote=u'Failed to download song information')
827 api_data = json.loads(api_json)
828 track_data = api_data[u'set']['track']
830 'id': track_data['id'],
831 'url': track_data['track_file_stream_url'],
832 'title': track_data['performer'] + u' - ' + track_data['name'],
833 'raw_title': track_data['name'],
834 'uploader_id': data['user']['login'],
# Stop at the mix's last track; otherwise advance via the 'next' endpoint.
838 if api_data['set']['at_last_track']:
840 next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
# Extractor for keek.com: video and thumbnail URLs are derived directly from
# the video id; title/uploader scraped from the page.
# NOTE(review): original line numbers are fused into each line and some lines
# are missing from this chunk; code kept verbatim.
843 class KeekIE(InfoExtractor):
844 _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
847 def _real_extract(self, url):
848 m = re.match(self._VALID_URL, url)
849 video_id = m.group('videoID')
851 video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
852 thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
853 webpage = self._download_webpage(url, video_id)
855 video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
# Uploader is optional (fatal=False).
858 uploader = self._html_search_regex(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
859 webpage, u'uploader', fatal=False)
865 'title': video_title,
866 'thumbnail': thumbnail,
# Extractor for www.ted.com: handles both single talks and playlists (each
# playlist entry is delegated back to this IE via url_result).
# NOTE(review): original line numbers are fused into each line and many lines
# are missing from this chunk; code kept verbatim.
871 class TEDIE(InfoExtractor):
872 _VALID_URL=r'''http://www\.ted\.com/
874 ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
876 ((?P<type_talk>talks)) # We have a simple talk
878 (/lang/(.*?))? # The url may contain the language
879 /(?P<name>\w+) # Here goes the name and then ".html"
# _VALID_URL is a verbose pattern, so matching needs re.VERBOSE here.
883 def suitable(cls, url):
884 """Receives a URL and returns True if suitable for this IE."""
885 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
887 def _real_extract(self, url):
888 m=re.match(self._VALID_URL, url, re.VERBOSE)
889 if m.group('type_talk'):
890 return [self._talk_info(url)]
892 playlist_id=m.group('playlist_id')
894 self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
895 return [self._playlist_videos_info(url,name,playlist_id)]
897 def _playlist_videos_info(self,url,name,playlist_id=0):
898 '''Returns the videos of the playlist'''
900 <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
901 ([.\s]*?)data-playlist_item_id="(\d+)"
902 ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
904 video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
905 webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
906 m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
907 m_names=re.finditer(video_name_RE,webpage)
909 playlist_title = self._html_search_regex(r'div class="headline">\s*?<h1>\s*?<span>(.*?)</span>',
910 webpage, 'playlist title')
# Each matched talk URL becomes a TED url_result playlist entry.
912 playlist_entries = []
913 for m_video, m_name in zip(m_videos,m_names):
914 video_id=m_video.group('video_id')
915 talk_url='http://www.ted.com%s' % m_name.group('talk_url')
916 playlist_entries.append(self.url_result(talk_url, 'TED'))
917 return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)
919 def _talk_info(self, url, video_id=0):
920 """Return the video for the talk in the url"""
921 m = re.match(self._VALID_URL, url,re.VERBOSE)
922 video_name = m.group('name')
923 webpage = self._download_webpage(url, video_id, 'Downloading \"%s\" page' % video_name)
924 self.report_extraction(video_name)
925 # If the url includes the language we get the title translated
926 title = self._html_search_regex(r'<span id="altHeadline" >(?P<title>.*)</span>',
# Talk metadata is embedded in the page as `var talkDetails = {...}`.
928 json_data = self._search_regex(r'<script.*?>var talkDetails = ({.*?})</script>',
929 webpage, 'json data')
930 info = json.loads(json_data)
931 desc = self._html_search_regex(r'<div class="talk-intro">.*?<p.*?>(.*?)</p>',
932 webpage, 'description', flags = re.DOTALL)
934 thumbnail = self._search_regex(r'</span>[\s.]*</div>[\s.]*<img src="(.*?)"',
935 webpage, 'thumbnail')
# The last htmlStreams entry is used as the media URL.
938 'url': info['htmlStreams'][-1]['file'],
941 'thumbnail': thumbnail,
# Extractor for www.myspass.de: the video id is taken from the URL path, then
# all metadata (flv URL, title, format, description, thumbnail) comes from an
# XML metadata endpoint.
# NOTE(review): original line numbers are fused into each line and some lines
# are missing from this chunk; code kept verbatim.
946 class MySpassIE(InfoExtractor):
947 _VALID_URL = r'http://www.myspass.de/.*'
949 def _real_extract(self, url):
950 META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'
952 # video id is the last path element of the URL
953 # usually there is a trailing slash, so also try the second but last
954 url_path = compat_urllib_parse_urlparse(url).path
955 url_parent_path, video_id = os.path.split(url_path)
957 _, video_id = os.path.split(url_parent_path)
960 metadata_url = META_DATA_URL_TEMPLATE % video_id
961 metadata_text = self._download_webpage(metadata_url, video_id)
962 metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))
964 # extract values from metadata
# url_flv and title are mandatory; format/description/thumbnail are optional.
965 url_flv_el = metadata.find('url_flv')
966 if url_flv_el is None:
967 raise ExtractorError(u'Unable to extract download url')
968 video_url = url_flv_el.text
969 extension = os.path.splitext(video_url)[1][1:]
970 title_el = metadata.find('title')
972 raise ExtractorError(u'Unable to extract title')
973 title = title_el.text
974 format_id_el = metadata.find('format_id')
975 if format_id_el is None:
978 format = format_id_el.text
979 description_el = metadata.find('description')
980 if description_el is not None:
981 description = description_el.text
984 imagePreview_el = metadata.find('imagePreview')
985 if imagePreview_el is not None:
986 thumbnail = imagePreview_el.text
995 'thumbnail': thumbnail,
996 'description': description
# Extractor for spiegel.de videos: title from the page, media filename and
# duration from a per-video XML manifest (last entry = chosen variant).
# NOTE(review): original line numbers are fused into each line and some lines
# are missing from this chunk; code kept verbatim.
1000 class SpiegelIE(InfoExtractor):
1001 _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'
1003 def _real_extract(self, url):
1004 m = re.match(self._VALID_URL, url)
1005 video_id = m.group('videoID')
1007 webpage = self._download_webpage(url, video_id)
1009 video_title = self._html_search_regex(r'<div class="module-title">(.*?)</div>',
1012 xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
1013 xml_code = self._download_webpage(xml_url, video_id,
1014 note=u'Downloading XML', errnote=u'Failed to download XML')
# The last <type> element of the manifest carries the filename and duration.
1016 idoc = xml.etree.ElementTree.fromstring(xml_code)
1017 last_type = idoc[-1]
1018 filename = last_type.findall('./filename')[0].text
1019 duration = float(last_type.findall('./duration')[0].text)
1021 video_url = 'http://video2.spiegel.de/flash/' + filename
1022 video_ext = filename.rpartition('.')[2]
1027 'title': video_title,
1028 'duration': duration,
# Extractor for liveleak.com view pages: media URL from the player config,
# title/description from og: meta tags, uploader from the 'By:' credit.
# NOTE(review): original line numbers are fused into each line and some lines
# are missing from this chunk; code kept verbatim.
1032 class LiveLeakIE(InfoExtractor):
1034 _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
1035 IE_NAME = u'liveleak'
1037 def _real_extract(self, url):
1038 mobj = re.match(self._VALID_URL, url)
1040 raise ExtractorError(u'Invalid URL: %s' % url)
1042 video_id = mobj.group('video_id')
1044 webpage = self._download_webpage(url, video_id)
1046 video_url = self._search_regex(r'file: "(.*?)",',
1047 webpage, u'video URL')
# Strip the site prefix LiveLeak puts in its og:title.
1049 video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
1050 webpage, u'title').replace('LiveLeak.com -', '').strip()
1052 video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
1053 webpage, u'description', fatal=False)
1055 video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
1056 webpage, u'uploader', fatal=False)
1062 'title': video_title,
1063 'description': video_description,
1064 'uploader': video_uploader
# Extractor for *.tumblr.com post/video pages: the media URL is embedded
# escaped (\x22-quoted) in the page source; title comes from <title>.
# NOTE(review): original line numbers are fused into each line and some lines
# are missing from this chunk; code kept verbatim.
1071 class TumblrIE(InfoExtractor):
1072 _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'
1074 def _real_extract(self, url):
1075 m_url = re.match(self._VALID_URL, url)
1076 video_id = m_url.group('id')
1077 blog = m_url.group('blog_name')
# Canonicalize to the /post/<id>/ URL before fetching.
1079 url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
1080 webpage = self._download_webpage(url, video_id)
1082 re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
1083 video = re.search(re_video, webpage)
1085 raise ExtractorError(u'Unable to extract video')
1086 video_url = video.group('video_url')
1087 ext = video.group('ext')
1089 video_thumbnail = self._search_regex(r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
1090 webpage, u'thumbnail', fatal=False) # We pick the first poster
# Un-escape the backslash-escaped thumbnail URL.
1091 if video_thumbnail: video_thumbnail = video_thumbnail.replace('\\', '')
1093 # The only place where you can get a title, it's not complete,
1094 # but searching in other places doesn't work for all videos
1095 video_title = self._html_search_regex(r'<title>(?P<title>.*?)</title>',
1096 webpage, u'title', flags=re.DOTALL)
1098 return [{'id': video_id,
1100 'title': video_title,
1101 'thumbnail': video_thumbnail,
1105 class BandcampIE(InfoExtractor):
# Extractor for free downloads of Bandcamp tracks.  Flow: track page ->
# free-download page -> statdownload API -> final media URL.
1106 _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'
1108 def _real_extract(self, url):
1109 mobj = re.match(self._VALID_URL, url)
1110 title = mobj.group('title')
1111 webpage = self._download_webpage(url, title)
1112 # We get the link to the free download page
1113 m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
1114 if m_download is None:
# Only tracks offered as free downloads are supported.
1115 raise ExtractorError(u'No free songs found')
1117 download_link = m_download.group(1)
# Track id lives inside the embedded TralbumData JS object.
# NOTE(review): the name 'id' shadows the builtin; kept as-is here.
1118 id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
1119 webpage, re.MULTILINE|re.DOTALL).group('id')
1121 download_webpage = self._download_webpage(download_link, id,
1122 'Downloading free downloads page')
1123 # We get the dictionary of the track from some javascrip code
1124 info = re.search(r'items: (.*?),$',
1125 download_webpage, re.MULTILINE).group(1)
1126 info = json.loads(info)[0]
1127 # We pick mp3-320 for now, until format selection can be easily implemented.
1128 mp3_info = info[u'downloads'][u'mp3-320']
1129 # If we try to use this url it says the link has expired
1130 initial_url = mp3_info[u'url']
1131 re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
1132 m_url = re.match(re_url, initial_url)
1133 #We build the url we will use to get the final track url
1134 # This url is build in Bandcamp in the script download_bunde_*.js
# NOTE(review): '.rand' is a hard-coded constant here; the site generates it
# dynamically, which is why the fallback 'retry_url' below is used instead of
# 'download_url'.
1135 request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), id, m_url.group('ts'))
1136 final_url_webpage = self._download_webpage(request_url, id, 'Requesting download url')
1137 # If we could correctly generate the .rand field the url would be
1138 #in the "download_url" key
1139 final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)
1141 track_info = {'id':id,
1142 'title' : info[u'title'],
1145 'thumbnail' : info[u'thumb_url'],
1146 'uploader' : info[u'artist']
1151 class RedTubeIE(InfoExtractor):
1152 """Information Extractor for redtube"""
1153 _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'
1155 def _real_extract(self,url):
1156 mobj = re.match(self._VALID_URL, url)
# URL did not match _VALID_URL: bail out.
1158 raise ExtractorError(u'Invalid URL: %s' % url)
1160 video_id = mobj.group('id')
# Container is always mp4, matching the <source type="video/mp4"> scraped below.
1161 video_extension = 'mp4'
1162 webpage = self._download_webpage(url, video_id)
1164 self.report_extraction(video_id)
# Direct media URL comes from the HTML5 <source> element.
1166 video_url = self._html_search_regex(r'<source src="(.+?)" type="video/mp4">',
1167 webpage, u'video URL')
# Title comes from the page's main <h1> heading.
1169 video_title = self._html_search_regex('<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
1175 'ext': video_extension,
1176 'title': video_title,
1179 class InaIE(InfoExtractor):
1180 """Information Extractor for Ina.fr"""
1181 _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'
1183 def _real_extract(self,url):
1184 mobj = re.match(self._VALID_URL, url)
1186 video_id = mobj.group('id')
# Metadata is fetched from the player's MRSS manifest rather than the HTML page.
1187 mrss_url='http://player.ina.fr/notices/%s.mrss' % video_id
1188 video_extension = 'mp4'
1189 webpage = self._download_webpage(mrss_url, video_id)
1191 self.report_extraction(video_id)
# The <media:player> element carries the direct mp4 URL.
1193 video_url = self._html_search_regex(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
1194 webpage, u'video URL')
# Title is wrapped in a CDATA section inside <title>.
1196 video_title = self._search_regex(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
1202 'ext': video_extension,
1203 'title': video_title,
1206 class HowcastIE(InfoExtractor):
1207 """Information Extractor for Howcast.com"""
1208 _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'
1210 def _real_extract(self, url):
1211 mobj = re.match(self._VALID_URL, url)
1213 video_id = mobj.group('id')
# Re-fetch via the canonical page URL for the numeric id.
1214 webpage_url = 'http://www.howcast.com/videos/' + video_id
1215 webpage = self._download_webpage(webpage_url, video_id)
1217 self.report_extraction(video_id)
# Direct mp4 URL is taken from the player config ('file': "...") and points
# at the mobile-media host.
1219 video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
1220 webpage, u'video URL')
# The og: meta tags may use either single or double quotes, hence the
# two-alternative capture groups.
1222 video_title = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
1225 video_description = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
1226 webpage, u'description', fatal=False)
1228 thumbnail = self._html_search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
1229 webpage, u'thumbnail', fatal=False)
1235 'title': video_title,
1236 'description': video_description,
1237 'thumbnail': thumbnail,
1240 class VineIE(InfoExtractor):
1241 """Information Extractor for Vine.co"""
1242 _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'
1244 def _real_extract(self, url):
1245 mobj = re.match(self._VALID_URL, url)
1247 video_id = mobj.group('id')
# Always fetch over https at the canonical short URL.
1248 webpage_url = 'https://vine.co/v/' + video_id
1249 webpage = self._download_webpage(webpage_url, video_id)
1251 self.report_extraction(video_id)
# The twitter:player:stream meta tag carries the direct media URL.
1253 video_url = self._html_search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
1254 webpage, u'video URL')
1256 video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
# og:image may carry a query string; the optional group keeps it out of the
# captured thumbnail URL.
1259 thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"',
1260 webpage, u'thumbnail', fatal=False)
# Uploader name sits in an <h2> inside the user block; DOTALL lets the
# pattern span lines.
1262 uploader = self._html_search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
1263 webpage, u'uploader', fatal=False, flags=re.DOTALL)
1269 'title': video_title,
1270 'thumbnail': thumbnail,
1271 'uploader': uploader,
1274 class FlickrIE(InfoExtractor):
1275 """Information Extractor for Flickr videos"""
# Flow: photo page (for the per-photo 'secret') -> video_mtl_xml.gne (for the
# node id) -> video_playlist.gne (for the stream app/path).
1276 _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'
1278 def _real_extract(self, url):
1279 mobj = re.match(self._VALID_URL, url)
1281 video_id = mobj.group('id')
1282 video_uploader_id = mobj.group('uploader_id')
1283 webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
1284 webpage = self._download_webpage(webpage_url, video_id)
# The photo 'secret' is required by both XML endpoints below.
1286 secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')
1288 first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
1289 first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')
1291 node_id = self._html_search_regex(r'<Item id="id">(\d+-\d+)</Item>',
1292 first_xml, u'node_id')
1294 second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
1295 second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')
1297 self.report_extraction(video_id)
# The playlist XML yields the stream as APP + FULLPATH; FULLPATH is
# HTML-escaped and must be unescaped before concatenation.
1299 mobj = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
1301 raise ExtractorError(u'Unable to extract video url')
1302 video_url = mobj.group(1) + unescapeHTML(mobj.group(2))
# og: meta tags may use either quote style, hence the two-alternative groups.
1304 video_title = self._html_search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
1305 webpage, u'video title')
1307 video_description = self._html_search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
1308 webpage, u'description', fatal=False)
1310 thumbnail = self._html_search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
1311 webpage, u'thumbnail', fatal=False)
1317 'title': video_title,
1318 'description': video_description,
1319 'thumbnail': thumbnail,
1320 'uploader_id': video_uploader_id,
1323 class TeamcocoIE(InfoExtractor):
# Extractor for teamcoco.com videos.  The URL only carries a title slug;
# the numeric video id is scraped from the page itself.
1324 _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'
1326 def _real_extract(self, url):
1327 mobj = re.match(self._VALID_URL, url)
# URL did not match _VALID_URL: bail out.
1329 raise ExtractorError(u'Invalid URL: %s' % url)
1330 url_title = mobj.group('url_title')
1331 webpage = self._download_webpage(url, url_title)
# The numeric id lives in the <article class="video" data-id="..."> attribute.
1333 video_id = self._html_search_regex(r'<article class="video" data-id="(\d+?)"',
1334 webpage, u'video id')
1336 self.report_extraction(video_id)
1338 video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
1341 thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)"',
1342 webpage, u'thumbnail', fatal=False)
1344 video_description = self._html_search_regex(r'<meta property="og:description" content="(.*?)"',
1345 webpage, u'description', fatal=False)
# Media URLs are listed in a per-video XML document; the high-quality
# <file type="high"> entry is selected.
1347 data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
1348 data = self._download_webpage(data_url, video_id, 'Downloading data webpage')
1350 video_url = self._html_search_regex(r'<file type="high".*?>(.*?)</file>',
1357 'title': video_title,
1358 'thumbnail': thumbnail,
1359 'description': video_description,
1362 class XHamsterIE(InfoExtractor):
1363 """Information Extractor for xHamster"""
1364 _VALID_URL = r'(?:http://)?(?:www.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'
1366 def _real_extract(self,url):
1367 mobj = re.match(self._VALID_URL, url)
1369 video_id = mobj.group('id')
# Re-fetch via a canonical URL built from the numeric id only.
1370 mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
1371 webpage = self._download_webpage(mrss_url, video_id)
# The player config carries a 'srv'/'file' pair from which the media URL
# is assembled.
1373 mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
1375 raise ExtractorError(u'Unable to extract media URL')
# An empty server means 'file' is already a complete, percent-encoded URL.
1376 if len(mobj.group('server')) == 0:
1377 video_url = compat_urllib_parse.unquote(mobj.group('file'))
1379 video_url = mobj.group('server')+'/key='+mobj.group('file')
# Extension is inferred from the final URL path.
1380 video_extension = video_url.split('.')[-1]
1382 video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
1385 # Can't see the description anywhere in the UI
1386 # video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
1387 # webpage, u'description', fatal=False)
1388 # if video_description: video_description = unescapeHTML(video_description)
# Upload date is scraped from a tooltip timestamp and normalized to YYYYMMDD;
# missing dates only produce a warning.
1390 mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
1392 video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
1394 video_upload_date = None
1395 self._downloader.report_warning(u'Unable to extract upload date')
# Anonymous uploads have no profile link, hence the 'anonymous' default.
1397 video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
1398 webpage, u'uploader id', default=u'anonymous')
1400 video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
1401 webpage, u'thumbnail', fatal=False)
1406 'ext': video_extension,
1407 'title': video_title,
1408 # 'description': video_description,
1409 'upload_date': video_upload_date,
1410 'uploader_id': video_uploader_id,
1411 'thumbnail': video_thumbnail
1414 class HypemIE(InfoExtractor):
1415 """Information Extractor for hypem"""
1416 _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'
1418 def _real_extract(self, url):
1419 mobj = re.match(self._VALID_URL, url)
1421 raise ExtractorError(u'Invalid URL: %s' % url)
1422 track_id = mobj.group(1)
# The page is requested with ax=1 and a current timestamp appended as a
# query string, mimicking the site's own requests.
1424 data = { 'ax': 1, 'ts': time.time() }
1425 data_encoded = compat_urllib_parse.urlencode(data)
1426 complete_url = url + "?" + data_encoded
1427 request = compat_urllib_request.Request(complete_url)
1428 response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
# The session cookie must be forwarded to the serve/source API call below.
1429 cookie = urlh.headers.get('Set-Cookie', '')
1431 self.report_extraction(track_id)
# Track metadata is embedded as JSON inside the displayList-data script tag.
1433 html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
1434 response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
1436 track_list = json.loads(html_tracks)
1437 track = track_list[u'tracks'][0]
1439 raise ExtractorError(u'Hypemachine contained invalid JSON.')
1442 track_id = track[u"id"]
1443 artist = track[u"artist"]
1444 title = track[u"song"]
# NOTE(review): 'key' is presumably read from the same track dict — its
# assignment is not visible here; confirm before modifying this flow.
1446 serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
1447 request = compat_urllib_request.Request(serve_url, "" , {'Content-Type': 'application/json'})
1448 request.add_header('cookie', cookie)
1449 song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
1451 song_data = json.loads(song_data_json)
1453 raise ExtractorError(u'Hypemachine contained invalid JSON.')
1454 final_url = song_data[u"url"]
1464 class Vbox7IE(InfoExtractor):
1465 """Information Extractor for Vbox7"""
1466 _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'
1468 def _real_extract(self,url):
1469 mobj = re.match(self._VALID_URL, url)
1471 raise ExtractorError(u'Invalid URL: %s' % url)
1472 video_id = mobj.group(1)
# The play page answers with a JavaScript redirect; follow it manually by
# scraping the window.location target and resolving it against the
# response URL.
1474 redirect_page, urlh = self._download_webpage_handle(url, video_id)
1475 new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
1476 redirect_url = urlh.geturl() + new_location
1477 webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')
# Title is the part of <title> before the first '/'.
1479 title = self._html_search_regex(r'<title>(.*)</title>',
1480 webpage, u'title').split('/')[0].strip()
# Media info is obtained with a form-encoded POST (as3=1, vid=<id>) to the
# player's magare.do endpoint.
1483 info_url = "http://vbox7.com/play/magare.do"
1484 data = compat_urllib_parse.urlencode({'as3':'1','vid':video_id})
1485 info_request = compat_urllib_request.Request(info_url, data)
1486 info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
1487 info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
1488 if info_response is None:
1489 raise ExtractorError(u'Unable to extract the media url')
# The response is a two-field query string; take the value after '=' of
# each '&'-separated field (media URL first, thumbnail second).
1490 (final_url, thumbnail_url) = map(lambda x: x.split('=')[1], info_response.split('&'))
1497 'thumbnail': thumbnail_url,
1501 def gen_extractors():
1502 """ Return a list of an instance of every supported extractor.
1503 The order does matter; the first extractor matched is the one handling the URL.
# One instance per supported site; more specific extractors must precede
# more generic ones so they get first chance at a URL.
1506 YoutubePlaylistIE(),
1531 StanfordOpenClassroomIE(),
1541 WorldStarHipHopIE(),
def get_info_extractor(ie_name):
    """Return the info extractor class registered under *ie_name*.

    The class is looked up in this module's namespace by appending the
    conventional 'IE' suffix (e.g. 'Youtube' -> YoutubeIE).  Raises
    KeyError when no such extractor exists.
    """
    class_name = ie_name + 'IE'
    return globals()[class_name]