10 import xml.etree.ElementTree
19 from .extractor.common import InfoExtractor, SearchInfoExtractor
21 from .extractor.ard import ARDIE
22 from .extractor.arte import ArteTvIE
23 from .extractor.bliptv import BlipTVIE, BlipTVUserIE
24 from .extractor.comedycentral import ComedyCentralIE
25 from .extractor.collegehumor import CollegeHumorIE
26 from .extractor.dailymotion import DailymotionIE
27 from .extractor.depositfiles import DepositFilesIE
28 from .extractor.escapist import EscapistIE
29 from .extractor.facebook import FacebookIE
30 from .extractor.gametrailers import GametrailersIE
31 from .extractor.generic import GenericIE
32 from .extractor.googleplus import GooglePlusIE
33 from .extractor.googlesearch import GoogleSearchIE
34 from .extractor.infoq import InfoQIE
35 from .extractor.metacafe import MetacafeIE
36 from .extractor.myvideo import MyVideoIE
37 from .extractor.nba import NBAIE
38 from .extractor.statigram import StatigramIE
39 from .extractor.photobucket import PhotobucketIE
40 from .extractor.soundcloud import SoundcloudIE, SoundcloudSetIE
41 from .extractor.stanfordoc import StanfordOpenClassroomIE
42 from .extractor.vimeo import VimeoIE
43 from .extractor.xvideos import XVideosIE
44 from .extractor.yahoo import YahooIE, YahooSearchIE
45 from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, YoutubeUserIE, YoutubeChannelIE
46 from .extractor.zdf import ZDFIE
50 class MixcloudIE(InfoExtractor):
51 """Information extractor for www.mixcloud.com"""
# Overview of what the visible code does:
#   * _real_extract() matches the page URL, derives an API URL ending in
#     '.json', downloads and parses it, and reads 'player_swf_url',
#     'audio_formats', 'name', 'thumbnail_url' and 'description' from it.
#   * get_urls() selects the candidate URL list for one format (and bitrate),
#     check_urls() probes candidates with urlopen().
#   * The extractor is flagged as not working via _WORKING = False.
53 _WORKING = False # New API, but it seems good http://www.mixcloud.com/developers/documentation/
# Group 1 is used as the uploader, group 2 as the second half of the file id.
54 _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/([\w\d-]+)/([\w\d-]+)'
# Prints a fixed progress message; the file_id argument is not used.
57 def report_download_json(self, file_id):
58 """Report JSON download."""
59 self.to_screen(u'Downloading json')
# Looks up jsonData[fmt][bitrate]. A bitrate of None, 'best', or one that is
# not a key of jsonData[fmt] is replaced by max(bitrate_list). When the lookup
# raises TypeError, jsonData[fmt] itself is used as the URL list.
# NOTE(review): if the bitrate keys are strings, max() orders them
# lexicographically rather than numerically - confirm against the API payload.
61 def get_urls(self, jsonData, fmt, bitrate='best'):
62 """Get urls from 'audio_formats' section in json"""
65 bitrate_list = jsonData[fmt]
66 if bitrate is None or bitrate == 'best' or bitrate not in bitrate_list:
67 bitrate = max(bitrate_list) # select highest
69 url_list = jsonData[fmt][bitrate]
70 except TypeError: # we have no bitrate info.
71 url_list = jsonData[fmt]
# Each candidate is opened with urlopen(); URLError, HTTPException and
# socket.error are the exception types named in the handler.
74 def check_urls(self, url_list):
75 """Returns 1st active url from list"""
78 compat_urllib_request.urlopen(url)
80 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
# Prints one "format<TAB>bitrate<TAB>[extension]" line per entry; the bitrate
# column is '??' on the TypeError path (no per-bitrate nesting).
85 def _print_formats(self, formats):
86 print('Available formats:')
87 for fmt in formats.keys():
88 for b in formats[fmt]:
90 ext = formats[fmt][b][0]
91 print('%s\t%s\t[%s]' % (fmt, b, ext.split('.')[-1]))
92 except TypeError: # we have no bitrate info
94 print('%s\t%s\t[%s]' % (fmt, '??', ext.split('.')[-1]))
# NOTE(review): this method calls .decode('utf-8') on regex groups, on values
# derived from them (file_id and uploader are decoded a second time when the
# result dict is built) and on file_url/player_url. On Python 3 `str` has no
# decode(), so these calls would raise AttributeError - confirm the target
# interpreter and the types returned by json.loads().
97 def _real_extract(self, url):
98 mobj = re.match(self._VALID_URL, url)
100 raise ExtractorError(u'Invalid URL: %s' % url)
101 # extract uploader & filename from url
102 uploader = mobj.group(1).decode('utf-8')
103 file_id = uploader + "-" + mobj.group(2).decode('utf-8')
# NOTE(review): url.split('/')[-3:-1] yields "<uploader>/<name>" only when the
# URL ends with '/'; without a trailing slash the slice is "<host>/<uploader>".
# _VALID_URL does not require the trailing slash - confirm.
105 # construct API request
106 file_url = 'http://www.mixcloud.com/api/1/cloudcast/' + '/'.join(url.split('/')[-3:-1]) + '.json'
107 # retrieve .json file with links to files
108 request = compat_urllib_request.Request(file_url)
110 self.report_download_json(file_url)
111 jsonData = compat_urllib_request.urlopen(request).read()
112 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
113 raise ExtractorError(u'Unable to retrieve file: %s' % compat_str(err))
116 json_data = json.loads(jsonData)
117 player_url = json_data['player_swf_url']
118 formats = dict(json_data['audio_formats'])
# The requested format comes from the downloader's 'format' parameter;
# 'listformats' triggers _print_formats().
120 req_format = self._downloader.params.get('format', None)
123 if self._downloader.params.get('listformats', None):
124 self._print_formats(formats)
# With no explicit format (None or 'best') every available format is tried via
# get_urls()/check_urls(); otherwise the requested format must be a key of
# `formats` or ExtractorError is raised.
127 if req_format is None or req_format == 'best':
128 for format_param in formats.keys():
129 url_list = self.get_urls(formats, format_param)
131 file_url = self.check_urls(url_list)
132 if file_url is not None:
135 if req_format not in formats:
136 raise ExtractorError(u'Format is not available')
138 url_list = self.get_urls(formats, req_format)
139 file_url = self.check_urls(url_list)
140 format_param = req_format
143 'id': file_id.decode('utf-8'),
144 'url': file_url.decode('utf-8'),
145 'uploader': uploader.decode('utf-8'),
147 'title': json_data['name'],
148 'ext': file_url.split('.')[-1].decode('utf-8'),
149 'format': (format_param is None and u'NA' or format_param.decode('utf-8')),
150 'thumbnail': json_data['thumbnail_url'],
151 'description': json_data['description'],
152 'player_url': player_url.decode('utf-8'),
156 class MTVIE(InfoExtractor):
157 """Information extractor for MTV.com"""
# Overview of what the visible code does: the video page is downloaded, several
# <meta> values and the default playlist id are scraped from it, a mediaGen
# URL is built from them, and the returned XML's <rendition> elements supply
# the format string and the video URL.
# Named groups: 'proto' (optional scheme) and 'videoid' (numeric id).
159 _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
# NOTE(review): the result dict uses `performer` for 'uploader', but no
# assignment to `performer` is visible in this method (song_name is assigned
# and not visibly used) - confirm it is defined, otherwise this is a NameError.
162 def _real_extract(self, url):
163 mobj = re.match(self._VALID_URL, url)
165 raise ExtractorError(u'Invalid URL: %s' % url)
# The scheme is optional in _VALID_URL, so it is added before downloading.
166 if not mobj.group('proto'):
167 url = 'http://' + url
168 video_id = mobj.group('videoid')
170 webpage = self._download_webpage(url, video_id)
172 song_name = self._html_search_regex(r'<meta name="mtv_vt" content="([^"]+)"/>',
173 webpage, u'song name', fatal=False)
175 video_title = self._html_search_regex(r'<meta name="mtv_an" content="([^"]+)"/>',
178 mtvn_uri = self._html_search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>',
179 webpage, u'mtvn_uri', fatal=False)
181 content_id = self._search_regex(r'MTVN.Player.defaultPlaylistId = ([0-9]+);',
182 webpage, u'content id', fatal=False)
# NOTE(review): mtvn_uri and content_id are looked up with fatal=False and then
# concatenated with '+'. Presumably a failed non-fatal lookup returns None
# (helper is defined outside this view), which would raise TypeError here -
# confirm.
184 videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
185 self.report_extraction(video_id)
186 request = compat_urllib_request.Request(videogen_url)
188 metadataXml = compat_urllib_request.urlopen(request).read()
189 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
190 raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))
192 mdoc = xml.etree.ElementTree.fromstring(metadataXml)
193 renditions = mdoc.findall('.//rendition')
# NOTE(review): findall() returns an empty list when nothing matches, in which
# case renditions[-1] raises IndexError - confirm whether that case is handled.
195 # For now, always pick the highest quality.
196 rendition = renditions[-1]
# The format label is "<subtype>-<width>x<height>_<bitrate>", where <subtype>
# is the part of the 'type' attribute after the '/'.
199 _,_,ext = rendition.attrib['type'].partition('/')
200 format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
201 video_url = rendition.find('./src').text
203 raise ExtractorError('Invalid rendition field.')
208 'uploader': performer,
210 'title': video_title,
218 class YoukuIE(InfoExtractor):
# Overview of what the visible code does: the playlist JSON for a video id is
# downloaded, a stream format is chosen, the obfuscated file id is decoded with
# a seed-driven character shuffle, and one download URL per segment key is
# built and collected in files_info.
# Named group 'ID' is the alphanumeric video id.
219 _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
# Session id: millisecond timestamp followed by two random integers
# (1000-1998 and 1000-9999), formatted as one decimal string.
222 nowTime = int(time.time() * 1000)
223 random1 = random.randint(1000,1998)
224 random2 = random.randint(1000,9999)
226 return "%d%d%d" %(nowTime,random1,random2)
# Builds a permutation of the alphabet below: each step advances `seed` with
# (seed * 211 + 30031) % 65536, scales it to an index into the remaining
# characters, appends that character to `mixed` and removes it from `source`.
# NOTE(review): `seed / 65536` has to be true division for the index to vary;
# with integer operands on Python 2 (no `from __future__ import division`) it
# would always be 0 - confirm.
# NOTE(review): "\:" in the non-raw literal is not a recognised escape; Python
# keeps the backslash (so the alphabet contains both '\' and ':') but newer
# interpreters warn about it - confirm whether "\\:" was intended.
228 def _get_file_ID_mix_string(self, seed):
230 source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
232 for i in range(len(source)):
233 seed = (seed * 211 + 30031 ) % 65536
234 index = math.floor(seed / 65536 * len(source) )
235 mixed.append(source[int(index)])
236 source.remove(source[int(index)])
237 #return ''.join(mixed)
# Decodes a '*'-separated list of integers: each integer indexes into the
# shuffled alphabet and the selected characters are joined into the real id.
240 def _get_file_id(self, fileId, seed):
241 mixed = self._get_file_ID_mix_string(seed)
242 ids = fileId.split('*')
246 realId.append(mixed[int(ch)])
247 return ''.join(realId)
# Reads title, seed, streamfileids and segs from config['data'][0]; the
# requested format comes from the downloader's 'format' parameter.
249 def _real_extract(self, url):
250 mobj = re.match(self._VALID_URL, url)
252 raise ExtractorError(u'Invalid URL: %s' % url)
253 video_id = mobj.group('ID')
255 info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
257 jsondata = self._download_webpage(info_url, video_id)
259 self.report_extraction(video_id)
261 config = json.loads(jsondata)
263 video_title = config['data'][0]['title']
264 seed = config['data'][0]['seed']
266 format = self._downloader.params.get('format', None)
267 supported_format = list(config['data'][0]['streamfileids'].keys())
269 if format is None or format == 'best':
270 if 'hd2' in supported_format:
275 elif format == 'worst':
283 fileid = config['data'][0]['streamfileids'][format]
284 keys = [s['k'] for s in config['data'][0]['segs'][format]]
285 except (UnicodeDecodeError, ValueError, KeyError):
286 raise ExtractorError(u'Unable to extract info section')
289 sid = self._gen_sid()
290 fileid = self._get_file_id(fileid, seed)
# NOTE(review): the slice below keeps fileid[0:8] and fileid[10:], i.e. it
# replaces the characters at indices 8 and 9 with the two-digit hex segment
# index, while the existing comment names fileid[7:9] - confirm which is right.
292 #column 8,9 of fileid represent the segment number
293 #fileid[7:9] should be changed
294 for index, key in enumerate(keys):
296 temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
297 download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
300 'id': '%s_part%02d' % (video_id, index),
304 'title': video_title,
307 files_info.append(info)
312 class XNXXIE(InfoExtractor):
313 """Information extractor for xnxx.com"""
# Overview of what the visible code does: the video page is downloaded and the
# video URL, title and thumbnail are pulled out of it with the three class-level
# patterns below. The video URL is percent-decoded; the thumbnail lookup is
# non-fatal.
# Group 1 is the numeric video id.
315 _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
# NOTE(review): the '.' in 'XNXX.COM' is unescaped and therefore matches any
# character - harmless here, but confirm it is intended.
317 VIDEO_URL_RE = r'flv_url=(.*?)&'
318 VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
319 VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'
321 def _real_extract(self, url):
322 mobj = re.match(self._VALID_URL, url)
324 raise ExtractorError(u'Invalid URL: %s' % url)
325 video_id = mobj.group(1)
327 # Get webpage content
328 webpage = self._download_webpage(url, video_id)
# The flv_url value is URL-encoded in the page, hence the unquote().
330 video_url = self._search_regex(self.VIDEO_URL_RE,
331 webpage, u'video URL')
332 video_url = compat_urllib_parse.unquote(video_url)
334 video_title = self._html_search_regex(self.VIDEO_TITLE_RE,
337 video_thumbnail = self._search_regex(self.VIDEO_THUMB_RE,
338 webpage, u'thumbnail', fatal=False)
345 'title': video_title,
347 'thumbnail': video_thumbnail,
354 class JustinTVIE(InfoExtractor):
355 """Information extractor for justin.tv and twitch.tv"""
# Overview of what the visible code does:
#   * _VALID_URL (verbose syntax via the inline (?x) flag) has three
#     alternatives: a channel ('channelid'), a broadcast under /b/ ('videoid')
#     and a chapter under /c/ ('chapterid').
#   * report_download_page() prints the offset window that is being fetched
#     (offset .. offset + _JUSTIN_PAGE_LIMIT).
#   * _parse_page() downloads one JSON page. A non-list response is treated as
#     an API error and raised as ExtractorError using its 'error' field. For
#     each clip it reads 'video_file_url', derives the extension from the URL,
#     turns the first 10 characters of 'start_time' into a date without
#     dashes, and falls back from 'user_id' to 'channel_id' and from 'title'
#     to the clip id. It returns (len(response), info), i.e. the count covers
#     every item in the response.
#   * _real_extract() picks the API endpoint by URL type. For chapters it
#     scrapes PP.archive_id from the page, finds the matching <archive> in the
#     by_chapter XML, loads extra metadata from the kraken API and warns that
#     the whole file is downloaded rather than just the chapter. For the other
#     types it pages through the API with offset/limit and stops when the
#     result is not paged or a page returns a count different from the limit.
# NOTE(review): `type(response) != list` is an exact-type check;
# isinstance(response, list) is the usual form - confirm before changing.
# NOTE(review): in the chapter branch the loop variable `a` is used after the
# `for a in doc.findall('.//archive')` loop; this relies on the loop either
# stopping at the matching archive or raising - confirm the control flow.
356 # TODO: One broadcast may be split into multiple videos. The key
357 # 'broadcast_id' is the same for all parts, and 'broadcast_part'
358 # starts at 1 and increases. Can we treat all parts as one video?
360 _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
362 (?P<channelid>[^/]+)|
363 (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
364 (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
368 _JUSTIN_PAGE_LIMIT = 100
369 IE_NAME = u'justin.tv'
371 def report_download_page(self, channel, offset):
372 """Report attempt to download a single page of videos."""
373 self.to_screen(u'%s: Downloading video information from %d to %d' %
374 (channel, offset, offset + self._JUSTIN_PAGE_LIMIT))
376 # Return count of items, list of *valid* items
377 def _parse_page(self, url, video_id):
378 webpage = self._download_webpage(url, video_id,
379 u'Downloading video info JSON',
380 u'unable to download video info JSON')
382 response = json.loads(webpage)
383 if type(response) != list:
384 error_text = response.get('error', 'unknown error')
385 raise ExtractorError(u'Justin.tv API: %s' % error_text)
387 for clip in response:
388 video_url = clip['video_file_url']
390 video_extension = os.path.splitext(video_url)[1][1:]
391 video_date = re.sub('-', '', clip['start_time'][:10])
392 video_uploader_id = clip.get('user_id', clip.get('channel_id'))
393 video_id = clip['id']
394 video_title = clip.get('title', video_id)
398 'title': video_title,
399 'uploader': clip.get('channel_name', video_uploader_id),
400 'uploader_id': video_uploader_id,
401 'upload_date': video_date,
402 'ext': video_extension,
404 return (len(response), info)
406 def _real_extract(self, url):
407 mobj = re.match(self._VALID_URL, url)
409 raise ExtractorError(u'invalid URL: %s' % url)
411 api_base = 'http://api.justin.tv'
413 if mobj.group('channelid'):
415 video_id = mobj.group('channelid')
416 api = api_base + '/channel/archives/%s.json' % video_id
417 elif mobj.group('chapterid'):
418 chapter_id = mobj.group('chapterid')
420 webpage = self._download_webpage(url, chapter_id)
421 m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
423 raise ExtractorError(u'Cannot find archive of a chapter')
424 archive_id = m.group(1)
426 api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
427 chapter_info_xml = self._download_webpage(api, chapter_id,
428 note=u'Downloading chapter information',
429 errnote=u'Chapter information download failed')
430 doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
431 for a in doc.findall('.//archive'):
432 if archive_id == a.find('./id').text:
435 raise ExtractorError(u'Could not find chapter in chapter information')
437 video_url = a.find('./video_file_url').text
438 video_ext = video_url.rpartition('.')[2] or u'flv'
440 chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
441 chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
442 note='Downloading chapter metadata',
443 errnote='Download of chapter metadata failed')
444 chapter_info = json.loads(chapter_info_json)
446 bracket_start = int(doc.find('.//bracket_start').text)
447 bracket_end = int(doc.find('.//bracket_end').text)
449 # TODO determine start (and probably fix up file)
450 # youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
451 #video_url += u'?start=' + TODO:start_timestamp
452 # bracket_start is 13290, but we want 51670615
453 self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
454 u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))
457 'id': u'c' + chapter_id,
460 'title': chapter_info['title'],
461 'thumbnail': chapter_info['preview'],
462 'description': chapter_info['description'],
463 'uploader': chapter_info['channel']['display_name'],
464 'uploader_id': chapter_info['channel']['name'],
468 video_id = mobj.group('videoid')
469 api = api_base + '/broadcast/by_archive/%s.json' % video_id
471 self.report_extraction(video_id)
475 limit = self._JUSTIN_PAGE_LIMIT
478 self.report_download_page(video_id, offset)
479 page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
480 page_count, page_info = self._parse_page(page_url, video_id)
481 info.extend(page_info)
482 if not paged or page_count != limit:
487 class FunnyOrDieIE(InfoExtractor):
# Overview of what the visible code does: the video page is downloaded and the
# stream URL, title and (optionally) description are scraped from its HTML.
# Named group 'id' is the hexadecimal video id.
488 _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
490 def _real_extract(self, url):
491 mobj = re.match(self._VALID_URL, url)
493 raise ExtractorError(u'invalid URL: %s' % url)
495 video_id = mobj.group('id')
496 webpage = self._download_webpage(url, video_id)
# The pattern skips the first <source> tag inside <video> and captures the
# src attribute of the second one.
498 video_url = self._html_search_regex(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"',
499 webpage, u'video URL', flags=re.DOTALL)
# Two patterns are passed as a tuple: the player page <h1> and the <title>
# tag. Presumably they are tried in order - confirm against the helper.
501 title = self._html_search_regex((r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>",
502 r'<title>(?P<title>[^<]+?)</title>'), webpage, 'title', flags=re.DOTALL)
# The description lookup is non-fatal (fatal=False).
504 video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
505 webpage, u'description', fatal=False, flags=re.DOTALL)
512 'description': video_description,
516 class SteamIE(InfoExtractor):
# Overview of what the visible code does:
#   * _VALID_URL is written in verbose regex syntax, so suitable() and
#     _real_extract() both pass re.VERBOSE to re.match().
#   * _real_extract() downloads the video page for the game id. If the page
#     contains the birth-date prompt it downloads the age-check URL instead
#     (fixed date of 1 January 1970 in _AGECHECK_TEMPLATE).
#   * Movies, titles and thumbnails are found with three separate finditer()
#     scans and combined with zip(); the result is returned as a single
#     playlist wrapped in a list.
# NOTE(review): zip() pairs the three scans purely by position and stops at
# the shortest one, so a page where the counts differ would drop or misalign
# entries - confirm the page layout guarantees equal counts.
# NOTE(review): _real_extract() reads m.group('gameID') without a visible
# None check on `m`; presumably it relies on suitable() having matched -
# confirm.
517 _VALID_URL = r"""http://store\.steampowered\.com/
519 (?P<urltype>video|app)/ #If the page is only for videos or for a game
521 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
523 _VIDEO_PAGE_TEMPLATE = 'http://store.steampowered.com/video/%s/'
524 _AGECHECK_TEMPLATE = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970'
527 def suitable(cls, url):
528 """Receives a URL and returns True if suitable for this IE."""
529 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
531 def _real_extract(self, url):
532 m = re.match(self._VALID_URL, url, re.VERBOSE)
533 gameID = m.group('gameID')
535 videourl = self._VIDEO_PAGE_TEMPLATE % gameID
536 webpage = self._download_webpage(videourl, gameID)
538 if re.search('<h2>Please enter your birth date to continue:</h2>', webpage) is not None:
539 videourl = self._AGECHECK_TEMPLATE % gameID
540 self.report_age_confirmation()
541 webpage = self._download_webpage(videourl, gameID)
543 self.report_extraction(gameID)
544 game_title = self._html_search_regex(r'<h2 class="pageheader">(.*?)</h2>',
545 webpage, 'game title')
547 urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
548 mweb = re.finditer(urlRE, webpage)
549 namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
550 titles = re.finditer(namesRE, webpage)
551 thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
552 thumbs = re.finditer(thumbsRE, webpage)
554 for vid,vtitle,thumb in zip(mweb,titles,thumbs):
555 video_id = vid.group('videoID')
556 title = vtitle.group('videoName')
557 video_url = vid.group('videoURL')
558 video_thumb = thumb.group('thumbnail')
560 raise ExtractorError(u'Cannot find video url for %s' % video_id)
565 'title': unescapeHTML(title),
566 'thumbnail': video_thumb
569 return [self.playlist_result(videos, gameID, game_title)]
571 class UstreamIE(InfoExtractor):
# Overview of what the visible code does: the media URL is built directly from
# the numeric id on tcdn.ustream.tv; the recorded-video page is downloaded only
# to scrape the title, uploader and thumbnail.
# Named group 'videoID' is the numeric id of the recording.
572 _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
# NOTE(review): `m` is used without a visible None check; presumably the URL
# has already been matched before this method is called - confirm.
575 def _real_extract(self, url):
576 m = re.match(self._VALID_URL, url)
577 video_id = m.group('videoID')
579 video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
580 webpage = self._download_webpage(url, video_id)
582 self.report_extraction(video_id)
# NOTE(review): '.+' is greedy, so the capture runs to the last '"' on the
# same line as data-title=, not to the attribute's closing quote - confirm
# this is intended (a non-greedy or [^"]+ form would stop at the first).
584 video_title = self._html_search_regex(r'data-title="(?P<title>.+)"',
# Uploader and thumbnail lookups are non-fatal (fatal=False).
587 uploader = self._html_search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
588 webpage, u'uploader', fatal=False, flags=re.DOTALL)
590 thumbnail = self._html_search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"',
591 webpage, u'thumbnail', fatal=False)
597 'title': video_title,
598 'uploader': uploader,
599 'thumbnail': thumbnail,
603 class WorldStarHipHopIE(InfoExtractor):
# Overview of what the visible code does: the video page is downloaded, the
# media URL is read from the so.addVariable("file", ...) call, and title and
# thumbnail are scraped from the HTML, with a separate title pattern for
# "candy" pages.
# Accepts the www and m hosts of worldstarhiphop.com and worldstarcandy.com;
# named group 'id' is everything after 'v='.
604 _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
605 IE_NAME = u'WorldStarHipHop'
# NOTE(review): `m` is used without a visible None check - confirm the URL is
# always pre-matched by the caller.
607 def _real_extract(self, url):
608 m = re.match(self._VALID_URL, url)
609 video_id = m.group('id')
611 webpage_src = self._download_webpage(url, video_id)
613 video_url = self._search_regex(r'so\.addVariable\("file","(.*?)"\)',
614 webpage_src, u'video URL')
# The branch below is keyed on the substring 'mp4' appearing anywhere in
# the URL, not on the URL's file extension.
616 if 'mp4' in video_url:
621 video_title = self._html_search_regex(r"<title>(.*)</title>",
622 webpage_src, u'title')
624 # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
625 thumbnail = self._html_search_regex(r'rel="image_src" href="(.*)" />',
626 webpage_src, u'thumbnail', fatal=False)
# Alternative title source: text before </span> following 'candytitles'.
629 _title = r"""candytitles.*>(.*)</span>"""
630 mobj = re.search(_title, webpage_src)
632 video_title = mobj.group(1)
637 'title' : video_title,
638 'thumbnail' : thumbnail,
643 class RBMARadioIE(InfoExtractor):
# Overview of what the visible code does: the show page is downloaded, the JSON
# object assigned to gon.show is cut out of the page and parsed, and the
# stream URL plus metadata are read from it.
# Named group 'videoID' is the last path component of a /shows/ URL.
644 _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'
# NOTE(review): `m` is used without a visible None check - confirm the URL is
# always pre-matched by the caller.
646 def _real_extract(self, url):
647 m = re.match(self._VALID_URL, url)
648 video_id = m.group('videoID')
650 webpage = self._download_webpage(url, video_id)
# With re.MULTILINE the trailing ';$' anchors at the end of a line, so the
# capture ends at a ';' that closes a line.
652 json_data = self._search_regex(r'window\.gon.*?gon\.show=(.+?);$',
653 webpage, u'json data', flags=re.MULTILINE)
656 data = json.loads(json_data)
657 except ValueError as e:
658 raise ExtractorError(u'Invalid JSON: ' + str(e))
# NOTE(review): appending '&cbr=256' assumes akamai_url already carries a
# query string - confirm. The extension is taken from the URL path only.
660 video_url = data['akamai_url'] + '&cbr=256'
661 url_parts = compat_urllib_parse_urlparse(video_url)
662 video_ext = url_parts.path.rpartition('.')[2]
# Only 'title' is required; the other fields use .get() and may be None.
# NOTE(review): data.get('host', {}) only substitutes {} when the key is
# absent; a present-but-null value would make the chained .get() fail -
# confirm the payload never contains null there.
667 'title': data['title'],
668 'description': data.get('teaser_text'),
669 'location': data.get('country_of_origin'),
670 'uploader': data.get('host', {}).get('name'),
671 'uploader_id': data.get('host', {}).get('slug'),
672 'thumbnail': data.get('image', {}).get('large_url_2x'),
673 'duration': data.get('duration'),
678 class YouPornIE(InfoExtractor):
679 """Information extractor for youporn.com."""
# Overview of what the visible code does: the watch page is requested with an
# age-verification cookie, metadata is read from the JSON passed to
# `new Video(...)`, every link in the download list becomes one format entry,
# and the downloader's 'format' parameter selects which entries are used.
# Named groups: 'videoid' (numeric) and 'title' (URL slug).
680 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
# Prints a two-column table using each entry's 'ext' and 'format' keys.
682 def _print_formats(self, formats):
683 """Print all available formats"""
684 print(u'Available formats:')
685 print(u'ext\t\tformat')
686 print(u'---------------------------------')
687 for format in formats:
688 print(u'%s\t\t%s' % (format['ext'], format['format']))
# Looks for an entry whose 'format' value equals req_format.
690 def _specific(self, req_format, formats):
692 if(x["format"]==req_format):
696 def _real_extract(self, url):
697 mobj = re.match(self._VALID_URL, url)
699 raise ExtractorError(u'Invalid URL: %s' % url)
700 video_id = mobj.group('videoid')
# The cookie marks the session as age-verified before the page is fetched.
702 req = compat_urllib_request.Request(url)
703 req.add_header('Cookie', 'age_verified=1')
704 webpage = self._download_webpage(req, video_id)
706 # Get JSON parameters
707 json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters')
709 params = json.loads(json_params)
711 raise ExtractorError(u'Invalid JSON')
713 self.report_extraction(video_id)
715 video_title = params['title']
716 upload_date = unified_strdate(params['release_date_f'])
717 video_description = params['description']
718 video_uploader = params['submitted_by']
719 thumbnail = params['thumbnails'][0]['image']
# NOTE(review): sys.exc_info()[1] is the exception instance, so concatenating
# it to a str with '+' raises TypeError instead of building the message; it
# needs to be converted to text first (e.g. compat_str(...)) - confirm and fix.
721 raise ExtractorError('Missing JSON parameter: ' + sys.exc_info()[1])
723 # Get all of the formats available
724 DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
725 download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
726 webpage, u'download list').strip()
728 # Get all of the links from the page
729 LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
730 links = re.findall(LINK_RE, download_list_html)
732 raise ExtractorError(u'ERROR: no known formats available for video')
734 self.to_screen(u'Links found: %d' % len(links))
739 # A link looks like this:
740 # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
741 # A path looks like this:
742 # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
# The format label is built from the first two '_'-separated parts of the
# fifth '/'-separated path element, joined with '-' ("480p-370k" for the
# sample path above). The extension comes from the URL path.
743 video_url = unescapeHTML( link )
744 path = compat_urllib_parse_urlparse( video_url ).path
745 extension = os.path.splitext( path )[1][1:]
746 format = path.split('/')[4].split('_')[:2]
749 format = "-".join( format )
750 # title = u'%s-%s-%s' % (video_title, size, bitrate)
755 'uploader': video_uploader,
756 'upload_date': upload_date,
757 'title': video_title,
760 'thumbnail': thumbnail,
761 'description': video_description
764 if self._downloader.params.get('listformats', None):
765 self._print_formats(formats)
# Format selection: None/'best', 'worst', '-1'/'all', or an exact format
# label resolved through _specific(); an unknown label raises ExtractorError.
768 req_format = self._downloader.params.get('format', None)
769 self.to_screen(u'Format: %s' % req_format)
771 if req_format is None or req_format == 'best':
773 elif req_format == 'worst':
775 elif req_format in ('-1', 'all'):
778 format = self._specific( req_format, formats )
780 raise ExtractorError(u'Requested format not available')
785 class PornotubeIE(InfoExtractor):
786 """Information extractor for pornotube.com."""
# Overview of what the visible code does: id and title come from the URL
# itself; the page is downloaded only for the .flv URL and the upload date.
# Named groups: optional 'channel', 'videoid' and 'title'.
787 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
789 def _real_extract(self, url):
790 mobj = re.match(self._VALID_URL, url)
792 raise ExtractorError(u'Invalid URL: %s' % url)
794 video_id = mobj.group('videoid')
795 video_title = mobj.group('title')
797 # Get webpage content
798 webpage = self._download_webpage(url, video_id)
# NOTE(review): the dots in 'video[0-9].pornotube.com' are unescaped and match
# any character - confirm whether '\.' was intended. The captured URL is
# percent-decoded afterwards.
801 VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
802 video_url = self._search_regex(VIDEO_URL_RE, webpage, u'video url')
803 video_url = compat_urllib_parse.unquote(video_url)
805 #Get the uploaded date
# The date lookup is non-fatal; a found value is normalised with
# unified_strdate().
806 VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
807 upload_date = self._html_search_regex(VIDEO_UPLOADED_RE, webpage, u'upload date', fatal=False)
808 if upload_date: upload_date = unified_strdate(upload_date)
810 info = {'id': video_id,
813 'upload_date': upload_date,
814 'title': video_title,
820 class YouJizzIE(InfoExtractor):
821 """Information extractor for youjizz.com."""
# Overview of what the visible code does: extraction is two-stage. The video
# page supplies the title and a link to an embed page; the embed page supplies
# the media URL. The embed page URL is also reported as 'player_url'.
# NOTE(review): the '.' before 'html' is unescaped and matches any character -
# confirm whether '\.html' was intended.
822 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'
824 def _real_extract(self, url):
825 mobj = re.match(self._VALID_URL, url)
827 raise ExtractorError(u'Invalid URL: %s' % url)
829 video_id = mobj.group('videoid')
831 # Get webpage content
832 webpage = self._download_webpage(url, video_id)
834 # Get the video title
835 video_title = self._html_search_regex(r'<title>(?P<title>.*)</title>',
836 webpage, u'title').strip()
# The embed link found in the page replaces video_id with the numeric id from
# the embed URL; the id taken from the page URL is not used after this point.
839 result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
841 raise ExtractorError(u'ERROR: unable to extract embed page')
843 embed_page_url = result.group(0).strip()
844 video_id = result.group('videoid')
# `webpage` is rebound to the embed page content from here on.
846 webpage = self._download_webpage(embed_page_url, video_id)
849 video_url = self._search_regex(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
850 webpage, u'video URL')
852 info = {'id': video_id,
854 'title': video_title,
857 'player_url': embed_page_url}
861 class EightTracksIE(InfoExtractor):
# Overview of what the visible code does: the mix page is downloaded and the
# JSON assigned to PAGE.mix is parsed; tracks are then fetched one at a time
# from the play/next API under a random session id until the API reports
# 'at_last_track'.
# NOTE(review): the '.' in '8tracks.com' is unescaped and matches any
# character - confirm whether '\.' was intended.
863 _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'
865 def _real_extract(self, url):
866 mobj = re.match(self._VALID_URL, url)
868 raise ExtractorError(u'Invalid URL: %s' % url)
869 playlist_id = mobj.group('id')
871 webpage = self._download_webpage(url, playlist_id)
# The capture ends at the first ';' that is immediately followed by a newline.
873 json_like = self._search_regex(r"PAGE.mix = (.*?);\n", webpage, u'trax information', flags=re.DOTALL)
874 data = json.loads(json_like)
# The session value is a random integer rendered as a string; it is reused
# for every play/next request of this extraction.
876 session = str(random.randint(0, 1000000000))
# NOTE(review): mix_id is used in the URLs below but its assignment is not
# visible here - confirm it is set from the parsed mix data.
878 track_count = data['tracks_count']
879 first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
# itertools.count() only provides the 1-based progress number for the note;
# the loop is ended by the 'at_last_track' flag, not by track_count.
882 for i in itertools.count():
883 api_json = self._download_webpage(next_url, playlist_id,
884 note=u'Downloading song information %s/%s' % (str(i+1), track_count),
885 errnote=u'Failed to download song information')
886 api_data = json.loads(api_json)
887 track_data = api_data[u'set']['track']
889 'id': track_data['id'],
890 'url': track_data['track_file_stream_url'],
891 'title': track_data['performer'] + u' - ' + track_data['name'],
892 'raw_title': track_data['name'],
893 'uploader_id': data['user']['login'],
897 if api_data['set']['at_last_track']:
# The next request passes the id of the track that was just returned.
899 next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
902 class KeekIE(InfoExtractor):
# Overview of what the visible code does: the media and thumbnail URLs are
# built directly from the id on cdn.keek.com; the page is downloaded only for
# the title and the uploader name.
# Accepts both the short '/!<id>' form and the '/<user>/keeks/<id>' form;
# named group 'videoID' is the id.
903 _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
# NOTE(review): `m` is used without a visible None check - confirm the URL is
# always pre-matched by the caller.
906 def _real_extract(self, url):
907 m = re.match(self._VALID_URL, url)
908 video_id = m.group('videoID')
# The thumbnail URL requests a fixed 100x75 size (w100/h75).
910 video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
911 thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
912 webpage = self._download_webpage(url, video_id)
914 video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
# The uploader lookup is non-fatal; [\S\s]+? lets the match span lines.
917 uploader = self._html_search_regex(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
918 webpage, u'uploader', fatal=False)
924 'title': video_title,
925 'thumbnail': thumbnail,
930 class TEDIE(InfoExtractor):
# Overview of what the visible code does:
#   * _VALID_URL is written in verbose regex syntax and distinguishes
#     playlist URLs ('type_playlist', 'playlist_id') from talk URLs
#     ('type_talk'); suitable(), _real_extract() and _talk_info() all pass
#     re.VERBOSE to re.match().
#   * _real_extract() returns a one-element list: the talk info for a talk
#     URL, otherwise the playlist result.
#   * _playlist_videos_info() downloads the playlist page, scans it once for
#     talk <li> entries and once for talk title links, and turns each paired
#     match into a url_result pointing at http://www.ted.com<talk_url>.
#   * _talk_info() downloads the talk page and reads the title, the
#     talkDetails JSON, the description and the thumbnail; the media URL is
#     the 'file' of the last entry in info['htmlStreams'].
# NOTE(review): inside a character class '.' is a literal dot, so '[.\s]*?'
# and '[\s.]*' match only dots and whitespace, not "any character" - confirm
# whether '[\s\S]' was intended.
# NOTE(review): the two finditer() scans in _playlist_videos_info() are paired
# by position with zip(), which stops at the shorter one; the video_id read
# from the first scan is not visibly used - confirm.
# NOTE(review): `name` is used in the playlist branch of _real_extract() but
# its assignment is not visible here - confirm.
931 _VALID_URL=r'''http://www\.ted\.com/
933 ((?P<type_playlist>playlists)/(?P<playlist_id>\d+)) # We have a playlist
935 ((?P<type_talk>talks)) # We have a simple talk
937 (/lang/(.*?))? # The url may contain the language
938 /(?P<name>\w+) # Here goes the name and then ".html"
942 def suitable(cls, url):
943 """Receives a URL and returns True if suitable for this IE."""
944 return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
946 def _real_extract(self, url):
947 m=re.match(self._VALID_URL, url, re.VERBOSE)
948 if m.group('type_talk'):
949 return [self._talk_info(url)]
951 playlist_id=m.group('playlist_id')
953 self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name))
954 return [self._playlist_videos_info(url,name,playlist_id)]
956 def _playlist_videos_info(self,url,name,playlist_id=0):
957 '''Returns the videos of the playlist'''
959 <li\ id="talk_(\d+)"([.\s]*?)data-id="(?P<video_id>\d+)"
960 ([.\s]*?)data-playlist_item_id="(\d+)"
961 ([.\s]*?)data-mediaslug="(?P<mediaSlug>.+?)"
963 video_name_RE=r'<p\ class="talk-title"><a href="(?P<talk_url>/talks/(.+).html)">(?P<fullname>.+?)</a></p>'
964 webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage')
965 m_videos=re.finditer(video_RE,webpage,re.VERBOSE)
966 m_names=re.finditer(video_name_RE,webpage)
968 playlist_title = self._html_search_regex(r'div class="headline">\s*?<h1>\s*?<span>(.*?)</span>',
969 webpage, 'playlist title')
971 playlist_entries = []
972 for m_video, m_name in zip(m_videos,m_names):
973 video_id=m_video.group('video_id')
974 talk_url='http://www.ted.com%s' % m_name.group('talk_url')
975 playlist_entries.append(self.url_result(talk_url, 'TED'))
976 return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title)
978 def _talk_info(self, url, video_id=0):
979 """Return the video for the talk in the url"""
980 m = re.match(self._VALID_URL, url,re.VERBOSE)
981 video_name = m.group('name')
982 webpage = self._download_webpage(url, video_id, 'Downloading \"%s\" page' % video_name)
983 self.report_extraction(video_name)
984 # If the url includes the language we get the title translated
985 title = self._html_search_regex(r'<span id="altHeadline" >(?P<title>.*)</span>',
987 json_data = self._search_regex(r'<script.*?>var talkDetails = ({.*?})</script>',
988 webpage, 'json data')
989 info = json.loads(json_data)
990 desc = self._html_search_regex(r'<div class="talk-intro">.*?<p.*?>(.*?)</p>',
991 webpage, 'description', flags = re.DOTALL)
993 thumbnail = self._search_regex(r'</span>[\s.]*</div>[\s.]*<img src="(.*?)"',
994 webpage, 'thumbnail')
997 'url': info['htmlStreams'][-1]['file'],
1000 'thumbnail': thumbnail,
1001 'description': desc,
1005 class MySpassIE(InfoExtractor):
# Overview of what the visible code does: the video id is taken from the URL
# path, a metadata XML document is downloaded for it, and URL, title, format,
# description and preview image are read from that XML.
# NOTE(review): the dots in 'www.myspass.de' are unescaped and match any
# character, and '.*' accepts every path on that host - confirm this is the
# intended breadth.
1006 _VALID_URL = r'http://www.myspass.de/.*'
1008 def _real_extract(self, url):
1009 META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'
1011 # video id is the last path element of the URL
1012 # usually there is a trailing slash, so also try the second but last
1013 url_path = compat_urllib_parse_urlparse(url).path
1014 url_parent_path, video_id = os.path.split(url_path)
1016 _, video_id = os.path.split(url_parent_path)
# The downloaded text is encoded to UTF-8 bytes before it is handed to the
# XML parser.
1019 metadata_url = META_DATA_URL_TEMPLATE % video_id
1020 metadata_text = self._download_webpage(metadata_url, video_id)
1021 metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))
# <url_flv> and <title> are mandatory: a missing element raises
# ExtractorError. The file extension is taken from the media URL.
1023 # extract values from metadata
1024 url_flv_el = metadata.find('url_flv')
1025 if url_flv_el is None:
1026 raise ExtractorError(u'Unable to extract download url')
1027 video_url = url_flv_el.text
1028 extension = os.path.splitext(video_url)[1][1:]
1029 title_el = metadata.find('title')
1030 if title_el is None:
1031 raise ExtractorError(u'Unable to extract title')
1032 title = title_el.text
# <format_id>, <description> and <imagePreview> are each checked for presence
# before their text is read.
1033 format_id_el = metadata.find('format_id')
1034 if format_id_el is None:
1037 format = format_id_el.text
1038 description_el = metadata.find('description')
1039 if description_el is not None:
1040 description = description_el.text
1043 imagePreview_el = metadata.find('imagePreview')
1044 if imagePreview_el is not None:
1045 thumbnail = imagePreview_el.text
1054 'thumbnail': thumbnail,
1055 'description': description
1059 class SpiegelIE(InfoExtractor):
# Overview of what the visible code does: the title is scraped from the video
# page; file name and duration come from a per-video XML document on
# video2.spiegel.de, using the document's last child element.
# Named group 'videoID' is the trailing number of the URL slug; the '.html'
# suffix and a '#fragment' are optional.
1060 _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'
# NOTE(review): `m` is used without a visible None check - confirm the URL is
# always pre-matched by the caller.
1062 def _real_extract(self, url):
1063 m = re.match(self._VALID_URL, url)
1064 video_id = m.group('videoID')
1066 webpage = self._download_webpage(url, video_id)
1068 video_title = self._html_search_regex(r'<div class="module-title">(.*?)</div>',
1071 xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
1072 xml_code = self._download_webpage(xml_url, video_id,
1073 note=u'Downloading XML', errnote=u'Failed to download XML')
# idoc[-1] is the last child of the XML root; presumably the entries are
# ordered so that the last one is the preferred variant - confirm.
# NOTE(review): findall(...)[0] raises IndexError when the element is absent.
1075 idoc = xml.etree.ElementTree.fromstring(xml_code)
1076 last_type = idoc[-1]
1077 filename = last_type.findall('./filename')[0].text
1078 duration = float(last_type.findall('./duration')[0].text)
# The extension is whatever follows the last '.' in the file name.
1080 video_url = 'http://video2.spiegel.de/flash/' + filename
1081 video_ext = filename.rpartition('.')[2]
1086 'title': video_title,
1087 'duration': duration,
1091 class LiveLeakIE(InfoExtractor):
1093 _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
1094 IE_NAME = u'liveleak'
1096 def _real_extract(self, url):
1097 mobj = re.match(self._VALID_URL, url)
1099 raise ExtractorError(u'Invalid URL: %s' % url)
1101 video_id = mobj.group('video_id')
1103 webpage = self._download_webpage(url, video_id)
1105 video_url = self._search_regex(r'file: "(.*?)",',
1106 webpage, u'video URL')
1108 video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
1109 webpage, u'title').replace('LiveLeak.com -', '').strip()
1111 video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
1112 webpage, u'description', fatal=False)
1114 video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
1115 webpage, u'uploader', fatal=False)
1121 'title': video_title,
1122 'description': video_description,
1123 'uploader': video_uploader
# TumblrIE -- extractor for videos embedded in tumblr.com posts.
# NOTE(review): this file is a line-numbered listing that skips some lines
# (here e.g. 1143, 1158 and everything after 1160), so the guard in front of
# the raise and part of the returned dict are not visible -- confirm against
# the complete file before changing behaviour.
1130 class TumblrIE(InfoExtractor):
# http://<blog_name>.tumblr.com/(post|video)/<id>/...
# NOTE(review): only plain http is accepted, and '\d*' also matches an empty
# id -- confirm both are intended.
1131 _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'
# Entry point: normalises the URL to the /post/ form, downloads it and pulls
# the media URL, extension, thumbnail and title out of the page text.
1133 def _real_extract(self, url):
1134 m_url = re.match(self._VALID_URL, url)
1135 video_id = m_url.group('id')
1136 blog = m_url.group('blog_name')
# Whatever form was given, the page fetched is always the /post/<id>/ URL.
1138 url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
1139 webpage = self._download_webpage(url, video_id)
# The pattern matches attributes delimited by the literal four characters
# '\x22' (an escaped double quote in the page text).
# NOTE(review): blog and video_id are interpolated into the pattern without
# re.escape().
1141 re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
1142 video = re.search(re_video, webpage)
# Raised when no video source is found; the guarding condition is on the
# skipped line 1143 (presumably 'if video is None:' -- confirm).
1144 raise ExtractorError(u'Unable to extract video')
1145 video_url = video.group('video_url')
1146 ext = video.group('ext')
1148 video_thumbnail = self._search_regex(r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
1149 webpage, u'thumbnail', fatal=False) # We pick the first poster
# Backslashes are stripped from the thumbnail URL when one was found.
1150 if video_thumbnail: video_thumbnail = video_thumbnail.replace('\\', '')
1152 # The only place where you can get a title, it's not complete,
1153 # but searching in other places doesn't work for all videos
1154 video_title = self._html_search_regex(r'<title>(?P<title>.*?)</title>',
1155 webpage, u'title', flags=re.DOTALL)
# Returns a one-element list holding the info dict; only some of its entries
# are visible in this listing.
1157 return [{'id': video_id,
1159 'title': video_title,
1160 'thumbnail': video_thumbnail,
# BandcampIE -- extractor for freely downloadable bandcamp.com tracks.
# NOTE(review): this file is a line-numbered listing that skips some lines
# (here e.g. 1202-1203 and everything after 1205), so part of the result dict
# and the return statement are not visible -- confirm against the complete
# file before changing behaviour.
1164 class BandcampIE(InfoExtractor):
# http://<anything>.bandcamp.com/track/<title>; plain http only.
1165 _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'
# Entry point. Steps, all taken from the code below:
#   1. download the track page and find its 'freeDownloadPage' link,
#   2. read the numeric track id from the 'TralbumData' script block,
#   3. download the free-download page and JSON-decode its 'items' list,
#   4. rewrite the mp3-320 download URL into a /statdownload/ request,
#   5. take the final media URL from the "retry_url" field of the response.
# Raises ExtractorError when the page has no free download link.
# NOTE(review): several re.search()/re.match() results below are used without
# a None check (1177-1178, 1183-1184, 1191/1194, 1198); a page-layout change
# would surface as AttributeError rather than ExtractorError.
1167 def _real_extract(self, url):
1168 mobj = re.match(self._VALID_URL, url)
1169 title = mobj.group('title')
# The URL's title slug doubles as the id passed to the download helper.
1170 webpage = self._download_webpage(url, title)
1171 # We get the link to the free download page
1172 m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
1173 if m_download is None:
1174 raise ExtractorError(u'No free songs found')
1176 download_link = m_download.group(1)
# NOTE(review): the local name 'id' shadows the builtin id() for the rest of
# this method.
1177 id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
1178 webpage, re.MULTILINE|re.DOTALL).group('id')
1180 download_webpage = self._download_webpage(download_link, id,
1181 'Downloading free downloads page')
1182 # We get the dictionary of the track from some javascrip code
1183 info = re.search(r'items: (.*?),$',
1184 download_webpage, re.MULTILINE).group(1)
# Only the first element of the decoded 'items' list is used.
1185 info = json.loads(info)[0]
1186 # We pick mp3-320 for now, until format selection can be easily implemented.
1187 mp3_info = info[u'downloads'][u'mp3-320']
1188 # If we try to use this url it says the link has expired
1189 initial_url = mp3_info[u'url']
# Splits the initial URL into server, fsig and ts; the 'id' group captured
# here is not used below (the id read from the track page is used instead).
1190 re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
1191 m_url = re.match(re_url, initial_url)
1192 #We build the url we will use to get the final track url
1193 # This url is build in Bandcamp in the script download_bunde_*.js
# The '.rand' value is a fixed constant, not a generated one (see the comment
# at 1196-1197 for the consequence).
1194 request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), id, m_url.group('ts'))
1195 final_url_webpage = self._download_webpage(request_url, id, 'Requesting download url')
1196 # If we could correctly generate the .rand field the url would be
1197 #in the "download_url" key
1198 final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)
# Visible entries of the result dict; its other entries are on skipped lines.
1200 track_info = {'id':id,
1201 'title' : info[u'title'],
1204 'thumbnail' : info[u'thumb_url'],
1205 'uploader' : info[u'artist']
# NOTE(review): this file is a line-numbered listing that skips some lines
# (here e.g. 1216, 1229-1233 and everything after 1235), so the guard in front
# of the raise, the tail of the title lookup, the start of the result dict and
# the return statement are not visible -- confirm against the complete file
# before changing behaviour.
1210 class RedTubeIE(InfoExtractor):
1211 """Information Extractor for redtube"""
# Optional http:// scheme and optional 'www.'; the numeric path component is
# captured as 'id'.
1212 _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'
# Entry point: downloads the page and reads the media URL from the mp4
# <source> tag and the title from the 'videoTitle' heading.
1214 def _real_extract(self,url):
1215 mobj = re.match(self._VALID_URL, url)
# Raised for URLs that do not match; the guarding condition is on the skipped
# line 1216 (presumably 'if mobj is None:' -- confirm).
1217 raise ExtractorError(u'Invalid URL: %s' % url)
1219 video_id = mobj.group('id')
# The extension is fixed to 'mp4' rather than derived from the media URL.
1220 video_extension = 'mp4'
1221 webpage = self._download_webpage(url, video_id)
1223 self.report_extraction(video_id)
1225 video_url = self._html_search_regex(r'<source src="(.+?)" type="video/mp4">',
1226 webpage, u'video URL')
# The remaining arguments of this call are on lines the listing skips.
1228 video_title = self._html_search_regex('<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
# Visible entries of the result dict; its other entries are on skipped lines.
1234 'ext': video_extension,
1235 'title': video_title,
1238 class InaIE(InfoExtractor):
1239 """Information Extractor for Ina.fr"""
1240 _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'
1242 def _real_extract(self,url):
1243 mobj = re.match(self._VALID_URL, url)
1245 video_id = mobj.group('id')
1246 mrss_url='http://player.ina.fr/notices/%s.mrss' % video_id
1247 video_extension = 'mp4'
1248 webpage = self._download_webpage(mrss_url, video_id)
1250 self.report_extraction(video_id)
1252 video_url = self._html_search_regex(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
1253 webpage, u'video URL')
1255 video_title = self._search_regex(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
1261 'ext': video_extension,
1262 'title': video_title,
# NOTE(review): this file is a line-numbered listing that skips some lines
# (here e.g. 1282-1283, 1289-1293 and everything after 1296), so the tail of
# the title lookup, the start of the result dict and the return statement are
# not visible -- confirm against the complete file before changing behaviour.
1265 class HowcastIE(InfoExtractor):
1266 """Information Extractor for Howcast.com"""
# Optional http(s):// scheme and optional 'www.'; the numeric video id is
# captured as 'id'.
1267 _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'
# Entry point: re-fetches the canonical video page for the id and scrapes the
# media URL from the player config and the metadata from <meta> tags.
1269 def _real_extract(self, url):
# NOTE(review): no None check on the match is visible; presumably callers only
# pass URLs that already matched _VALID_URL -- confirm.
1270 mobj = re.match(self._VALID_URL, url)
1272 video_id = mobj.group('id')
# The page is always fetched from the canonical http://www.howcast.com URL,
# whatever form the input URL had.
1273 webpage_url = 'http://www.howcast.com/videos/' + video_id
1274 webpage = self._download_webpage(webpage_url, video_id)
1276 self.report_extraction(video_id)
# Only numeric .mp4 files on mobile-media.howcast.com are accepted; the 'file'
# key may or may not be single-quoted.
1278 video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
1279 webpage, u'video URL')
# The content attribute may be double- or single-quoted, hence two alternative
# capture groups; the remaining arguments of this call are on skipped lines.
1281 video_title = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
# Description and thumbnail are looked up with fatal=False (presumably a
# missing match is tolerated rather than raised -- confirm in the helper).
1284 video_description = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
1285 webpage, u'description', fatal=False)
1287 thumbnail = self._html_search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
1288 webpage, u'thumbnail', fatal=False)
# Visible entries of the result dict; its other entries are on skipped lines.
1294 'title': video_title,
1295 'description': video_description,
1296 'thumbnail': thumbnail,
# NOTE(review): this file is a line-numbered listing that skips some lines
# (here e.g. 1316-1317, 1323-1327 and everything after 1330), so the tail of
# the title lookup, the start of the result dict and the return statement are
# not visible -- confirm against the complete file before changing behaviour.
1299 class VineIE(InfoExtractor):
1300 """Information Extractor for Vine.co"""
# Optional http(s):// scheme and optional 'www.'; the path component after
# /v/ is captured as 'id'.
1301 _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'
# Entry point: re-fetches the canonical https page for the id and reads the
# stream URL, title and thumbnail from <meta> tags and the uploader from the
# 'user' block of the page.
1303 def _real_extract(self, url):
# NOTE(review): no None check on the match is visible; presumably callers only
# pass URLs that already matched _VALID_URL -- confirm.
1304 mobj = re.match(self._VALID_URL, url)
1306 video_id = mobj.group('id')
1307 webpage_url = 'https://vine.co/v/' + video_id
1308 webpage = self._download_webpage(webpage_url, video_id)
1310 self.report_extraction(video_id)
1312 video_url = self._html_search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
1313 webpage, u'video URL')
# The remaining arguments of this call are on lines the listing skips.
1315 video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
# The optional second group absorbs a '?query' suffix, so the first group
# stops before any query string.
1318 thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"',
1319 webpage, u'thumbnail', fatal=False)
# re.DOTALL lets '.*?' span line breaks between the 'user' div and its <h2>.
1321 uploader = self._html_search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
1322 webpage, u'uploader', fatal=False, flags=re.DOTALL)
# Visible entries of the result dict; its other entries are on skipped lines.
1328 'title': video_title,
1329 'thumbnail': thumbnail,
1330 'uploader': uploader,
# NOTE(review): this file is a line-numbered listing that skips some lines
# (here e.g. 1359, 1371-1375 and everything after 1379), so the guard in front
# of the raise, the start of the result dict and the return statement are not
# visible -- confirm against the complete file before changing behaviour.
1333 class FlickrIE(InfoExtractor):
1334 """Information Extractor for Flickr videos"""
# Optional http(s):// scheme and optional 'www.';
# /photos/<uploader_id>/<numeric id> with anything allowed afterwards.
1335 _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'
# Entry point. The media URL is resolved in three requests:
#   1. the photo page, for 'photo_secret' and the og: metadata,
#   2. video_mtl_xml.gne (photo id + secret), for the node id,
#   3. video_playlist.gne (node id + secret), for the <STREAM> element.
1337 def _real_extract(self, url):
# NOTE(review): no None check on this match is visible; presumably callers
# only pass URLs that already matched _VALID_URL -- confirm.
1338 mobj = re.match(self._VALID_URL, url)
1340 video_id = mobj.group('id')
1341 video_uploader_id = mobj.group('uploader_id')
1342 webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
1343 webpage = self._download_webpage(webpage_url, video_id)
1345 secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')
1347 first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
1348 first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')
# The node id has the form <digits>-<digits>.
1350 node_id = self._html_search_regex(r'<Item id="id">(\d+-\d+)</Item>',
1351 first_xml, u'node_id')
1353 second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
1354 second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')
1356 self.report_extraction(video_id)
# 'mobj' is reused here for the <STREAM> match.
1358 mobj = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
# Raised when no <STREAM> element is found; the guarding condition is on the
# skipped line 1359 (presumably 'if mobj is None:' -- confirm).
1360 raise ExtractorError(u'Unable to extract video url')
# Media URL = APP attribute followed by the HTML-unescaped FULLPATH attribute.
1361 video_url = mobj.group(1) + unescapeHTML(mobj.group(2))
# The og: content attributes may be double- or single-quoted, hence the two
# alternative capture groups in each pattern.
1363 video_title = self._html_search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
1364 webpage, u'video title')
1366 video_description = self._html_search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
1367 webpage, u'description', fatal=False)
1369 thumbnail = self._html_search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
1370 webpage, u'thumbnail', fatal=False)
# Visible entries of the result dict; its other entries are on skipped lines.
1376 'title': video_title,
1377 'description': video_description,
1378 'thumbnail': thumbnail,
1379 'uploader_id': video_uploader_id,
# TeamcocoIE -- extractor for teamcoco.com video pages.
# NOTE(review): this file is a line-numbered listing that skips some lines
# (here e.g. 1387, 1398-1399, 1410-1415 and everything after 1418), so the
# guard in front of the raise, the tails of two lookups, the start of the
# result dict and the return statement are not visible -- confirm against the
# complete file before changing behaviour.
1382 class TeamcocoIE(InfoExtractor):
# Plain http and no 'www.'; everything after /video/ is captured as
# 'url_title'.
1383 _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'
# Entry point: the numeric video id is not in the URL, so it is read from the
# page first and then used to fetch the XML that lists the media files.
1385 def _real_extract(self, url):
1386 mobj = re.match(self._VALID_URL, url)
# Raised for URLs that do not match; the guarding condition is on the skipped
# line 1387 (presumably 'if mobj is None:' -- confirm).
1388 raise ExtractorError(u'Invalid URL: %s' % url)
1389 url_title = mobj.group('url_title')
# Until the numeric id is known, the URL title stands in as the id passed to
# the download helper.
1390 webpage = self._download_webpage(url, url_title)
1392 video_id = self._html_search_regex(r'<article class="video" data-id="(\d+?)"',
1393 webpage, u'video id')
1395 self.report_extraction(video_id)
# The remaining arguments of this call are on lines the listing skips.
1397 video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
1400 thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)"',
1401 webpage, u'thumbnail', fatal=False)
1403 video_description = self._html_search_regex(r'<meta property="og:description" content="(.*?)"',
1404 webpage, u'description', fatal=False)
1406 data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
1407 data = self._download_webpage(data_url, video_id, 'Downloading data webpage')
# Picks the <file type="high"> entry; the remaining arguments are on skipped
# lines (presumably the search runs over 'data' -- confirm).
1409 video_url = self._html_search_regex(r'<file type="high".*?>(.*?)</file>',
# Visible entries of the result dict; its other entries are on skipped lines.
1416 'title': video_title,
1417 'thumbnail': thumbnail,
1418 'description': video_description,
1421 class XHamsterIE(InfoExtractor):
1422 """Information Extractor for xHamster"""
1423 _VALID_URL = r'(?:http://)?(?:www.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'
1425 def _real_extract(self,url):
1426 mobj = re.match(self._VALID_URL, url)
1428 video_id = mobj.group('id')
1429 mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
1430 webpage = self._download_webpage(mrss_url, video_id)
1432 mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
1434 raise ExtractorError(u'Unable to extract media URL')
1435 if len(mobj.group('server')) == 0:
1436 video_url = compat_urllib_parse.unquote(mobj.group('file'))
1438 video_url = mobj.group('server')+'/key='+mobj.group('file')
1439 video_extension = video_url.split('.')[-1]
1441 video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
1444 # Can't see the description anywhere in the UI
1445 # video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
1446 # webpage, u'description', fatal=False)
1447 # if video_description: video_description = unescapeHTML(video_description)
1449 mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
1451 video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
1453 video_upload_date = None
1454 self._downloader.report_warning(u'Unable to extract upload date')
1456 video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
1457 webpage, u'uploader id', default=u'anonymous')
1459 video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
1460 webpage, u'thumbnail', fatal=False)
1465 'ext': video_extension,
1466 'title': video_title,
1467 # 'description': video_description,
1468 'upload_date': video_upload_date,
1469 'uploader_id': video_uploader_id,
1470 'thumbnail': video_thumbnail
# NOTE(review): this file is a line-numbered listing that skips some lines
# (here e.g. 1479, 1494, 1497, 1499-1500, 1504, 1509, 1511 and everything
# after 1513), so guards, try/except lines, at least one assignment and the
# whole result dict are not visible -- confirm against the complete file
# before changing behaviour.
1473 class HypemIE(InfoExtractor):
1474 """Information Extractor for hypem"""
# Optional http:// scheme and optional 'www.'; /track/<first>/<second>, of
# which only the first path component is used below.
1475 _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'
# Entry point: downloads the track page (keeping the cookie the server sets),
# decodes the embedded 'displayList-data' JSON, then asks the /serve/source/
# endpoint for the final media URL.
1477 def _real_extract(self, url):
1478 mobj = re.match(self._VALID_URL, url)
# Raised for URLs that do not match; the guarding condition is on the skipped
# line 1479 (presumably 'if mobj is None:' -- confirm).
1480 raise ExtractorError(u'Invalid URL: %s' % url)
1481 track_id = mobj.group(1)
# 'ax=1' and the current timestamp are appended as a query string.
# NOTE(review): the '?' is added unconditionally, which assumes the input URL
# carries no query string of its own.
1483 data = { 'ax': 1, 'ts': time.time() }
1484 data_encoded = compat_urllib_parse.urlencode(data)
1485 complete_url = url + "?" + data_encoded
1486 request = compat_urllib_request.Request(complete_url)
1487 response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
# The Set-Cookie header (empty string if absent) is replayed on the second
# request below.
1488 cookie = urlh.headers.get('Set-Cookie', '')
1490 self.report_extraction(track_id)
1492 html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
1493 response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
# Only the first entry of the 'tracks' list is used.
1495 track_list = json.loads(html_tracks)
1496 track = track_list[u'tracks'][0]
# Error for undecodable JSON; the try/except lines around 1495-1498 are
# skipped in the listing.
1498 raise ExtractorError(u'Hypemachine contained invalid JSON.')
# From here on track_id is the id reported by the JSON, not the URL component.
1501 track_id = track[u"id"]
1502 artist = track[u"artist"]
1503 title = track[u"song"]
# NOTE(review): 'key' is not assigned on any visible line (1499-1500 and 1504
# are skipped); presumably it is read from the track dict -- confirm.
1505 serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
# NOTE(review): a data argument ("") is supplied, which presumably makes this
# a POST; on Python 3 the request body is expected to be bytes -- confirm.
1506 request = compat_urllib_request.Request(serve_url, "" , {'Content-Type': 'application/json'})
1507 request.add_header('cookie', cookie)
1508 song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
1510 song_data = json.loads(song_data_json)
# Same error message as above for the second JSON document; its try/except
# lines (1509, 1511) are likewise skipped.
1512 raise ExtractorError(u'Hypemachine contained invalid JSON.')
1513 final_url = song_data[u"url"]
# NOTE(review): this file is a line-numbered listing that skips some lines
# (here e.g. 1529, 1550-1555 and everything after 1556), so the guard in front
# of the first raise, most of the result dict and the return statement are
# not visible -- confirm against the complete file before changing behaviour.
1523 class Vbox7IE(InfoExtractor):
1524 """Information Extractor for Vbox7"""
# Optional http:// scheme and optional 'www.'; everything after 'play:' up to
# the next '/' is the video id.
1525 _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'
# Entry point: follows the JavaScript redirect of the landing page to get the
# title, then POSTs the video id to magare.do to obtain the media and
# thumbnail URLs.
1527 def _real_extract(self,url):
1528 mobj = re.match(self._VALID_URL, url)
# Raised for URLs that do not match; the guarding condition is on the skipped
# line 1529 (presumably 'if mobj is None:' -- confirm).
1530 raise ExtractorError(u'Invalid URL: %s' % url)
1531 video_id = mobj.group(1)
# The first response only contains a script-driven redirect
# ("window.location = '...';"); its target is appended to the URL that was
# actually fetched.
1533 redirect_page, urlh = self._download_webpage_handle(url, video_id)
1534 new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
1535 redirect_url = urlh.geturl() + new_location
1536 webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')
# Title is the part of <title> before the first '/', stripped of whitespace.
1538 title = self._html_search_regex(r'<title>(.*)</title>',
1539 webpage, u'title').split('/')[0].strip()
# Form-encoded request body: as3=1 and the video id.
# NOTE(review): urlencode() yields text; on Python 3 a request body is
# expected to be bytes -- confirm how the compat layer handles this.
1542 info_url = "http://vbox7.com/play/magare.do"
1543 data = compat_urllib_parse.urlencode({'as3':'1','vid':video_id})
1544 info_request = compat_urllib_request.Request(info_url, data)
1545 info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
1546 info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
1547 if info_response is None:
1548 raise ExtractorError(u'Unable to extract the media url')
# The response is expected to hold exactly two '&'-separated 'key=value'
# pairs: media URL first, thumbnail URL second.
# NOTE(review): split('=')[1] keeps only the text between the first and second
# '=', so a value containing '=' would be cut short; any other number of
# pairs makes the tuple unpacking fail.
1549 (final_url, thumbnail_url) = map(lambda x: x.split('=')[1], info_response.split('&'))
# Visible entry of the result dict; its other entries are on skipped lines.
1556 'thumbnail': thumbnail_url,
# gen_extractors() -- per its docstring, returns a list holding one instance
# of every supported extractor; the order matters because the first extractor
# that matches a URL is the one that handles it.
# NOTE(review): this file is a line-numbered listing that skips some lines;
# here the docstring's closing quotes, the 'return [' line and all but three
# list entries fall on skipped line numbers -- confirm the full list and its
# order against the complete file before editing.
1560 def gen_extractors():
1561 """ Return a list of an instance of every supported extractor.
1562 The order does matter; the first extractor matched is the one handling the URL.
1565 YoutubePlaylistIE(),
1590 StanfordOpenClassroomIE(),
1600 WorldStarHipHopIE(),
# get_info_extractor(ie_name) -- resolves an extractor class by name: 'IE' is
# appended to ie_name and the result is looked up in this module's globals().
# The class itself is returned, not an instance. No fallback is provided, so
# a name without a matching module-level '<ie_name>IE' raises KeyError.
1630 def get_info_extractor(ie_name):
1631 """Returns the info extractor class with the given ie_name"""
1632 return globals()[ie_name+'IE']