10 import xml.etree.ElementTree
19 from .extractor.common import InfoExtractor, SearchInfoExtractor
21 from .extractor.ard import ARDIE
22 from .extractor.arte import ArteTvIE
23 from .extractor.bliptv import BlipTVIE, BlipTVUserIE
24 from .extractor.comedycentral import ComedyCentralIE
25 from .extractor.collegehumor import CollegeHumorIE
26 from .extractor.dailymotion import DailymotionIE
27 from .extractor.depositfiles import DepositFilesIE
28 from .extractor.escapist import EscapistIE
29 from .extractor.facebook import FacebookIE
30 from .extractor.gametrailers import GametrailersIE
31 from .extractor.generic import GenericIE
32 from .extractor.googleplus import GooglePlusIE
33 from .extractor.googlesearch import GoogleSearchIE
34 from .extractor.infoq import InfoQIE
35 from .extractor.metacafe import MetacafeIE
36 from .extractor.mixcloud import MixcloudIE
37 from .extractor.mtv import MTVIE
38 from .extractor.myvideo import MyVideoIE
39 from .extractor.nba import NBAIE
40 from .extractor.statigram import StatigramIE
41 from .extractor.photobucket import PhotobucketIE
42 from .extractor.soundcloud import SoundcloudIE, SoundcloudSetIE
43 from .extractor.stanfordoc import StanfordOpenClassroomIE
44 from .extractor.ted import TEDIE
45 from .extractor.vimeo import VimeoIE
46 from .extractor.xvideos import XVideosIE
47 from .extractor.yahoo import YahooIE, YahooSearchIE
48 from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, YoutubeUserIE, YoutubeChannelIE
49 from .extractor.zdf import ZDFIE
57 class YoukuIE(InfoExtractor):
58 _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
61 nowTime = int(time.time() * 1000)
62 random1 = random.randint(1000,1998)
63 random2 = random.randint(1000,9999)
65 return "%d%d%d" %(nowTime,random1,random2)
67 def _get_file_ID_mix_string(self, seed):
69 source = list("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ/\:._-1234567890")
71 for i in range(len(source)):
72 seed = (seed * 211 + 30031 ) % 65536
73 index = math.floor(seed / 65536 * len(source) )
74 mixed.append(source[int(index)])
75 source.remove(source[int(index)])
76 #return ''.join(mixed)
79 def _get_file_id(self, fileId, seed):
80 mixed = self._get_file_ID_mix_string(seed)
81 ids = fileId.split('*')
85 realId.append(mixed[int(ch)])
86 return ''.join(realId)
88 def _real_extract(self, url):
89 mobj = re.match(self._VALID_URL, url)
91 raise ExtractorError(u'Invalid URL: %s' % url)
92 video_id = mobj.group('ID')
94 info_url = 'http://v.youku.com/player/getPlayList/VideoIDS/' + video_id
96 jsondata = self._download_webpage(info_url, video_id)
98 self.report_extraction(video_id)
100 config = json.loads(jsondata)
102 video_title = config['data'][0]['title']
103 seed = config['data'][0]['seed']
105 format = self._downloader.params.get('format', None)
106 supported_format = list(config['data'][0]['streamfileids'].keys())
108 if format is None or format == 'best':
109 if 'hd2' in supported_format:
114 elif format == 'worst':
122 fileid = config['data'][0]['streamfileids'][format]
123 keys = [s['k'] for s in config['data'][0]['segs'][format]]
124 except (UnicodeDecodeError, ValueError, KeyError):
125 raise ExtractorError(u'Unable to extract info section')
128 sid = self._gen_sid()
129 fileid = self._get_file_id(fileid, seed)
131 #column 8,9 of fileid represent the segment number
132 #fileid[7:9] should be changed
133 for index, key in enumerate(keys):
135 temp_fileid = '%s%02X%s' % (fileid[0:8], index, fileid[10:])
136 download_url = 'http://f.youku.com/player/getFlvPath/sid/%s_%02X/st/flv/fileid/%s?k=%s' % (sid, index, temp_fileid, key)
139 'id': '%s_part%02d' % (video_id, index),
143 'title': video_title,
146 files_info.append(info)
151 class XNXXIE(InfoExtractor):
152 """Information extractor for xnxx.com"""
154 _VALID_URL = r'^(?:https?://)?video\.xnxx\.com/video([0-9]+)/(.*)'
156 VIDEO_URL_RE = r'flv_url=(.*?)&'
157 VIDEO_TITLE_RE = r'<title>(.*?)\s+-\s+XNXX.COM'
158 VIDEO_THUMB_RE = r'url_bigthumb=(.*?)&'
160 def _real_extract(self, url):
161 mobj = re.match(self._VALID_URL, url)
163 raise ExtractorError(u'Invalid URL: %s' % url)
164 video_id = mobj.group(1)
166 # Get webpage content
167 webpage = self._download_webpage(url, video_id)
169 video_url = self._search_regex(self.VIDEO_URL_RE,
170 webpage, u'video URL')
171 video_url = compat_urllib_parse.unquote(video_url)
173 video_title = self._html_search_regex(self.VIDEO_TITLE_RE,
176 video_thumbnail = self._search_regex(self.VIDEO_THUMB_RE,
177 webpage, u'thumbnail', fatal=False)
184 'title': video_title,
186 'thumbnail': video_thumbnail,
193 class JustinTVIE(InfoExtractor):
194 """Information extractor for justin.tv and twitch.tv"""
195 # TODO: One broadcast may be split into multiple videos. The key
196 # 'broadcast_id' is the same for all parts, and 'broadcast_part'
197 # starts at 1 and increases. Can we treat all parts as one video?
199 _VALID_URL = r"""(?x)^(?:http://)?(?:www\.)?(?:twitch|justin)\.tv/
201 (?P<channelid>[^/]+)|
202 (?:(?:[^/]+)/b/(?P<videoid>[^/]+))|
203 (?:(?:[^/]+)/c/(?P<chapterid>[^/]+))
207 _JUSTIN_PAGE_LIMIT = 100
208 IE_NAME = u'justin.tv'
def report_download_page(self, channel, offset):
    """Announce which slice of a channel's video list is being fetched.

    The reported slice starts at ``offset`` and ends at
    ``offset + self._JUSTIN_PAGE_LIMIT``; the message is emitted through
    ``self.to_screen``.
    """
    page_end = offset + self._JUSTIN_PAGE_LIMIT
    message = u'%s: Downloading video information from %d to %d' % (
        channel, offset, page_end)
    self.to_screen(message)
215 # Return count of items, list of *valid* items
216 def _parse_page(self, url, video_id):
217 webpage = self._download_webpage(url, video_id,
218 u'Downloading video info JSON',
219 u'unable to download video info JSON')
221 response = json.loads(webpage)
222 if type(response) != list:
223 error_text = response.get('error', 'unknown error')
224 raise ExtractorError(u'Justin.tv API: %s' % error_text)
226 for clip in response:
227 video_url = clip['video_file_url']
229 video_extension = os.path.splitext(video_url)[1][1:]
230 video_date = re.sub('-', '', clip['start_time'][:10])
231 video_uploader_id = clip.get('user_id', clip.get('channel_id'))
232 video_id = clip['id']
233 video_title = clip.get('title', video_id)
237 'title': video_title,
238 'uploader': clip.get('channel_name', video_uploader_id),
239 'uploader_id': video_uploader_id,
240 'upload_date': video_date,
241 'ext': video_extension,
243 return (len(response), info)
245 def _real_extract(self, url):
246 mobj = re.match(self._VALID_URL, url)
248 raise ExtractorError(u'invalid URL: %s' % url)
250 api_base = 'http://api.justin.tv'
252 if mobj.group('channelid'):
254 video_id = mobj.group('channelid')
255 api = api_base + '/channel/archives/%s.json' % video_id
256 elif mobj.group('chapterid'):
257 chapter_id = mobj.group('chapterid')
259 webpage = self._download_webpage(url, chapter_id)
260 m = re.search(r'PP\.archive_id = "([0-9]+)";', webpage)
262 raise ExtractorError(u'Cannot find archive of a chapter')
263 archive_id = m.group(1)
265 api = api_base + '/broadcast/by_chapter/%s.xml' % chapter_id
266 chapter_info_xml = self._download_webpage(api, chapter_id,
267 note=u'Downloading chapter information',
268 errnote=u'Chapter information download failed')
269 doc = xml.etree.ElementTree.fromstring(chapter_info_xml)
270 for a in doc.findall('.//archive'):
271 if archive_id == a.find('./id').text:
274 raise ExtractorError(u'Could not find chapter in chapter information')
276 video_url = a.find('./video_file_url').text
277 video_ext = video_url.rpartition('.')[2] or u'flv'
279 chapter_api_url = u'https://api.twitch.tv/kraken/videos/c' + chapter_id
280 chapter_info_json = self._download_webpage(chapter_api_url, u'c' + chapter_id,
281 note='Downloading chapter metadata',
282 errnote='Download of chapter metadata failed')
283 chapter_info = json.loads(chapter_info_json)
285 bracket_start = int(doc.find('.//bracket_start').text)
286 bracket_end = int(doc.find('.//bracket_end').text)
288 # TODO determine start (and probably fix up file)
289 # youtube-dl -v http://www.twitch.tv/firmbelief/c/1757457
290 #video_url += u'?start=' + TODO:start_timestamp
291 # bracket_start is 13290, but we want 51670615
292 self._downloader.report_warning(u'Chapter detected, but we can just download the whole file. '
293 u'Chapter starts at %s and ends at %s' % (formatSeconds(bracket_start), formatSeconds(bracket_end)))
296 'id': u'c' + chapter_id,
299 'title': chapter_info['title'],
300 'thumbnail': chapter_info['preview'],
301 'description': chapter_info['description'],
302 'uploader': chapter_info['channel']['display_name'],
303 'uploader_id': chapter_info['channel']['name'],
307 video_id = mobj.group('videoid')
308 api = api_base + '/broadcast/by_archive/%s.json' % video_id
310 self.report_extraction(video_id)
314 limit = self._JUSTIN_PAGE_LIMIT
317 self.report_download_page(video_id, offset)
318 page_url = api + ('?offset=%d&limit=%d' % (offset, limit))
319 page_count, page_info = self._parse_page(page_url, video_id)
320 info.extend(page_info)
321 if not paged or page_count != limit:
326 class FunnyOrDieIE(InfoExtractor):
327 _VALID_URL = r'^(?:https?://)?(?:www\.)?funnyordie\.com/videos/(?P<id>[0-9a-f]+)/.*$'
329 def _real_extract(self, url):
330 mobj = re.match(self._VALID_URL, url)
332 raise ExtractorError(u'invalid URL: %s' % url)
334 video_id = mobj.group('id')
335 webpage = self._download_webpage(url, video_id)
337 video_url = self._html_search_regex(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"',
338 webpage, u'video URL', flags=re.DOTALL)
340 title = self._html_search_regex((r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>",
341 r'<title>(?P<title>[^<]+?)</title>'), webpage, 'title', flags=re.DOTALL)
343 video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
344 webpage, u'description', fatal=False, flags=re.DOTALL)
351 'description': video_description,
355 class SteamIE(InfoExtractor):
356 _VALID_URL = r"""http://store\.steampowered\.com/
358 (?P<urltype>video|app)/ #If the page is only for videos or for a game
360 (?P<videoID>\d*)(?P<extra>\??) #For urltype == video we sometimes get the videoID
362 _VIDEO_PAGE_TEMPLATE = 'http://store.steampowered.com/video/%s/'
363 _AGECHECK_TEMPLATE = 'http://store.steampowered.com/agecheck/video/%s/?snr=1_agecheck_agecheck__age-gate&ageDay=1&ageMonth=January&ageYear=1970'
def suitable(cls, url):
    """Return True when *url* can be handled by this IE, False otherwise.

    The URL is matched from its start against ``cls._VALID_URL``, which is
    compiled with ``re.VERBOSE`` so the pattern may contain whitespace and
    inline comments.
    """
    matched = re.match(cls._VALID_URL, url, re.VERBOSE)
    return matched is not None
370 def _real_extract(self, url):
371 m = re.match(self._VALID_URL, url, re.VERBOSE)
372 gameID = m.group('gameID')
374 videourl = self._VIDEO_PAGE_TEMPLATE % gameID
375 webpage = self._download_webpage(videourl, gameID)
377 if re.search('<h2>Please enter your birth date to continue:</h2>', webpage) is not None:
378 videourl = self._AGECHECK_TEMPLATE % gameID
379 self.report_age_confirmation()
380 webpage = self._download_webpage(videourl, gameID)
382 self.report_extraction(gameID)
383 game_title = self._html_search_regex(r'<h2 class="pageheader">(.*?)</h2>',
384 webpage, 'game title')
386 urlRE = r"'movie_(?P<videoID>\d+)': \{\s*FILENAME: \"(?P<videoURL>[\w:/\.\?=]+)\"(,\s*MOVIE_NAME: \"(?P<videoName>[\w:/\.\?=\+-]+)\")?\s*\},"
387 mweb = re.finditer(urlRE, webpage)
388 namesRE = r'<span class="title">(?P<videoName>.+?)</span>'
389 titles = re.finditer(namesRE, webpage)
390 thumbsRE = r'<img class="movie_thumb" src="(?P<thumbnail>.+?)">'
391 thumbs = re.finditer(thumbsRE, webpage)
393 for vid,vtitle,thumb in zip(mweb,titles,thumbs):
394 video_id = vid.group('videoID')
395 title = vtitle.group('videoName')
396 video_url = vid.group('videoURL')
397 video_thumb = thumb.group('thumbnail')
399 raise ExtractorError(u'Cannot find video url for %s' % video_id)
404 'title': unescapeHTML(title),
405 'thumbnail': video_thumb
408 return [self.playlist_result(videos, gameID, game_title)]
410 class UstreamIE(InfoExtractor):
411 _VALID_URL = r'https?://www\.ustream\.tv/recorded/(?P<videoID>\d+)'
414 def _real_extract(self, url):
415 m = re.match(self._VALID_URL, url)
416 video_id = m.group('videoID')
418 video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
419 webpage = self._download_webpage(url, video_id)
421 self.report_extraction(video_id)
423 video_title = self._html_search_regex(r'data-title="(?P<title>.+)"',
426 uploader = self._html_search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
427 webpage, u'uploader', fatal=False, flags=re.DOTALL)
429 thumbnail = self._html_search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"',
430 webpage, u'thumbnail', fatal=False)
436 'title': video_title,
437 'uploader': uploader,
438 'thumbnail': thumbnail,
442 class WorldStarHipHopIE(InfoExtractor):
443 _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
444 IE_NAME = u'WorldStarHipHop'
446 def _real_extract(self, url):
447 m = re.match(self._VALID_URL, url)
448 video_id = m.group('id')
450 webpage_src = self._download_webpage(url, video_id)
452 video_url = self._search_regex(r'so\.addVariable\("file","(.*?)"\)',
453 webpage_src, u'video URL')
455 if 'mp4' in video_url:
460 video_title = self._html_search_regex(r"<title>(.*)</title>",
461 webpage_src, u'title')
463 # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
464 thumbnail = self._html_search_regex(r'rel="image_src" href="(.*)" />',
465 webpage_src, u'thumbnail', fatal=False)
468 _title = r"""candytitles.*>(.*)</span>"""
469 mobj = re.search(_title, webpage_src)
471 video_title = mobj.group(1)
476 'title' : video_title,
477 'thumbnail' : thumbnail,
482 class RBMARadioIE(InfoExtractor):
483 _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'
485 def _real_extract(self, url):
486 m = re.match(self._VALID_URL, url)
487 video_id = m.group('videoID')
489 webpage = self._download_webpage(url, video_id)
491 json_data = self._search_regex(r'window\.gon.*?gon\.show=(.+?);$',
492 webpage, u'json data', flags=re.MULTILINE)
495 data = json.loads(json_data)
496 except ValueError as e:
497 raise ExtractorError(u'Invalid JSON: ' + str(e))
499 video_url = data['akamai_url'] + '&cbr=256'
500 url_parts = compat_urllib_parse_urlparse(video_url)
501 video_ext = url_parts.path.rpartition('.')[2]
506 'title': data['title'],
507 'description': data.get('teaser_text'),
508 'location': data.get('country_of_origin'),
509 'uploader': data.get('host', {}).get('name'),
510 'uploader_id': data.get('host', {}).get('slug'),
511 'thumbnail': data.get('image', {}).get('large_url_2x'),
512 'duration': data.get('duration'),
517 class YouPornIE(InfoExtractor):
518 """Information extractor for youporn.com."""
519 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P<videoid>[0-9]+)/(?P<title>[^/]+)'
def _print_formats(self, formats):
    """Print a table of the available formats to standard output.

    ``formats`` is an iterable of mappings; the ``'ext'`` and ``'format'``
    entries of each one are printed on one row, below a three-line header.
    """
    header_lines = (
        u'Available formats:',
        u'ext\t\tformat',
        u'---------------------------------',
    )
    for header_line in header_lines:
        print(header_line)
    for entry in formats:
        print(u'%s\t\t%s' % (entry['ext'], entry['format']))
529 def _specific(self, req_format, formats):
531 if(x["format"]==req_format):
535 def _real_extract(self, url):
536 mobj = re.match(self._VALID_URL, url)
538 raise ExtractorError(u'Invalid URL: %s' % url)
539 video_id = mobj.group('videoid')
541 req = compat_urllib_request.Request(url)
542 req.add_header('Cookie', 'age_verified=1')
543 webpage = self._download_webpage(req, video_id)
545 # Get JSON parameters
546 json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters')
548 params = json.loads(json_params)
550 raise ExtractorError(u'Invalid JSON')
552 self.report_extraction(video_id)
554 video_title = params['title']
555 upload_date = unified_strdate(params['release_date_f'])
556 video_description = params['description']
557 video_uploader = params['submitted_by']
558 thumbnail = params['thumbnails'][0]['image']
560 raise ExtractorError('Missing JSON parameter: ' + sys.exc_info()[1])
562 # Get all of the formats available
563 DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
564 download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
565 webpage, u'download list').strip()
567 # Get all of the links from the page
568 LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
569 links = re.findall(LINK_RE, download_list_html)
571 raise ExtractorError(u'ERROR: no known formats available for video')
573 self.to_screen(u'Links found: %d' % len(links))
578 # A link looks like this:
579 # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0
580 # A path looks like this:
581 # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4
582 video_url = unescapeHTML( link )
583 path = compat_urllib_parse_urlparse( video_url ).path
584 extension = os.path.splitext( path )[1][1:]
585 format = path.split('/')[4].split('_')[:2]
588 format = "-".join( format )
589 # title = u'%s-%s-%s' % (video_title, size, bitrate)
594 'uploader': video_uploader,
595 'upload_date': upload_date,
596 'title': video_title,
599 'thumbnail': thumbnail,
600 'description': video_description
603 if self._downloader.params.get('listformats', None):
604 self._print_formats(formats)
607 req_format = self._downloader.params.get('format', None)
608 self.to_screen(u'Format: %s' % req_format)
610 if req_format is None or req_format == 'best':
612 elif req_format == 'worst':
614 elif req_format in ('-1', 'all'):
617 format = self._specific( req_format, formats )
619 raise ExtractorError(u'Requested format not available')
624 class PornotubeIE(InfoExtractor):
625 """Information extractor for pornotube.com."""
626 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$'
628 def _real_extract(self, url):
629 mobj = re.match(self._VALID_URL, url)
631 raise ExtractorError(u'Invalid URL: %s' % url)
633 video_id = mobj.group('videoid')
634 video_title = mobj.group('title')
636 # Get webpage content
637 webpage = self._download_webpage(url, video_id)
640 VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
641 video_url = self._search_regex(VIDEO_URL_RE, webpage, u'video url')
642 video_url = compat_urllib_parse.unquote(video_url)
644 #Get the uploaded date
645 VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
646 upload_date = self._html_search_regex(VIDEO_UPLOADED_RE, webpage, u'upload date', fatal=False)
647 if upload_date: upload_date = unified_strdate(upload_date)
649 info = {'id': video_id,
652 'upload_date': upload_date,
653 'title': video_title,
659 class YouJizzIE(InfoExtractor):
660 """Information extractor for youjizz.com."""
661 _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'
663 def _real_extract(self, url):
664 mobj = re.match(self._VALID_URL, url)
666 raise ExtractorError(u'Invalid URL: %s' % url)
668 video_id = mobj.group('videoid')
670 # Get webpage content
671 webpage = self._download_webpage(url, video_id)
673 # Get the video title
674 video_title = self._html_search_regex(r'<title>(?P<title>.*)</title>',
675 webpage, u'title').strip()
678 result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
680 raise ExtractorError(u'ERROR: unable to extract embed page')
682 embed_page_url = result.group(0).strip()
683 video_id = result.group('videoid')
685 webpage = self._download_webpage(embed_page_url, video_id)
688 video_url = self._search_regex(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
689 webpage, u'video URL')
691 info = {'id': video_id,
693 'title': video_title,
696 'player_url': embed_page_url}
700 class EightTracksIE(InfoExtractor):
702 _VALID_URL = r'https?://8tracks.com/(?P<user>[^/]+)/(?P<id>[^/#]+)(?:#.*)?$'
704 def _real_extract(self, url):
705 mobj = re.match(self._VALID_URL, url)
707 raise ExtractorError(u'Invalid URL: %s' % url)
708 playlist_id = mobj.group('id')
710 webpage = self._download_webpage(url, playlist_id)
712 json_like = self._search_regex(r"PAGE.mix = (.*?);\n", webpage, u'trax information', flags=re.DOTALL)
713 data = json.loads(json_like)
715 session = str(random.randint(0, 1000000000))
717 track_count = data['tracks_count']
718 first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id)
721 for i in itertools.count():
722 api_json = self._download_webpage(next_url, playlist_id,
723 note=u'Downloading song information %s/%s' % (str(i+1), track_count),
724 errnote=u'Failed to download song information')
725 api_data = json.loads(api_json)
726 track_data = api_data[u'set']['track']
728 'id': track_data['id'],
729 'url': track_data['track_file_stream_url'],
730 'title': track_data['performer'] + u' - ' + track_data['name'],
731 'raw_title': track_data['name'],
732 'uploader_id': data['user']['login'],
736 if api_data['set']['at_last_track']:
738 next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id'])
741 class KeekIE(InfoExtractor):
742 _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
745 def _real_extract(self, url):
746 m = re.match(self._VALID_URL, url)
747 video_id = m.group('videoID')
749 video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
750 thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
751 webpage = self._download_webpage(url, video_id)
753 video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
756 uploader = self._html_search_regex(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
757 webpage, u'uploader', fatal=False)
763 'title': video_title,
764 'thumbnail': thumbnail,
770 class MySpassIE(InfoExtractor):
771 _VALID_URL = r'http://www.myspass.de/.*'
773 def _real_extract(self, url):
774 META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'
776 # video id is the last path element of the URL
777 # usually there is a trailing slash, so also try the second but last
778 url_path = compat_urllib_parse_urlparse(url).path
779 url_parent_path, video_id = os.path.split(url_path)
781 _, video_id = os.path.split(url_parent_path)
784 metadata_url = META_DATA_URL_TEMPLATE % video_id
785 metadata_text = self._download_webpage(metadata_url, video_id)
786 metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))
788 # extract values from metadata
789 url_flv_el = metadata.find('url_flv')
790 if url_flv_el is None:
791 raise ExtractorError(u'Unable to extract download url')
792 video_url = url_flv_el.text
793 extension = os.path.splitext(video_url)[1][1:]
794 title_el = metadata.find('title')
796 raise ExtractorError(u'Unable to extract title')
797 title = title_el.text
798 format_id_el = metadata.find('format_id')
799 if format_id_el is None:
802 format = format_id_el.text
803 description_el = metadata.find('description')
804 if description_el is not None:
805 description = description_el.text
808 imagePreview_el = metadata.find('imagePreview')
809 if imagePreview_el is not None:
810 thumbnail = imagePreview_el.text
819 'thumbnail': thumbnail,
820 'description': description
824 class SpiegelIE(InfoExtractor):
825 _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'
827 def _real_extract(self, url):
828 m = re.match(self._VALID_URL, url)
829 video_id = m.group('videoID')
831 webpage = self._download_webpage(url, video_id)
833 video_title = self._html_search_regex(r'<div class="module-title">(.*?)</div>',
836 xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
837 xml_code = self._download_webpage(xml_url, video_id,
838 note=u'Downloading XML', errnote=u'Failed to download XML')
840 idoc = xml.etree.ElementTree.fromstring(xml_code)
842 filename = last_type.findall('./filename')[0].text
843 duration = float(last_type.findall('./duration')[0].text)
845 video_url = 'http://video2.spiegel.de/flash/' + filename
846 video_ext = filename.rpartition('.')[2]
851 'title': video_title,
852 'duration': duration,
856 class LiveLeakIE(InfoExtractor):
858 _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
859 IE_NAME = u'liveleak'
861 def _real_extract(self, url):
862 mobj = re.match(self._VALID_URL, url)
864 raise ExtractorError(u'Invalid URL: %s' % url)
866 video_id = mobj.group('video_id')
868 webpage = self._download_webpage(url, video_id)
870 video_url = self._search_regex(r'file: "(.*?)",',
871 webpage, u'video URL')
873 video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
874 webpage, u'title').replace('LiveLeak.com -', '').strip()
876 video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
877 webpage, u'description', fatal=False)
879 video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
880 webpage, u'uploader', fatal=False)
886 'title': video_title,
887 'description': video_description,
888 'uploader': video_uploader
895 class TumblrIE(InfoExtractor):
896 _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'
898 def _real_extract(self, url):
899 m_url = re.match(self._VALID_URL, url)
900 video_id = m_url.group('id')
901 blog = m_url.group('blog_name')
903 url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
904 webpage = self._download_webpage(url, video_id)
906 re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
907 video = re.search(re_video, webpage)
909 raise ExtractorError(u'Unable to extract video')
910 video_url = video.group('video_url')
911 ext = video.group('ext')
913 video_thumbnail = self._search_regex(r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
914 webpage, u'thumbnail', fatal=False) # We pick the first poster
915 if video_thumbnail: video_thumbnail = video_thumbnail.replace('\\', '')
917 # The only place where you can get a title, it's not complete,
918 # but searching in other places doesn't work for all videos
919 video_title = self._html_search_regex(r'<title>(?P<title>.*?)</title>',
920 webpage, u'title', flags=re.DOTALL)
922 return [{'id': video_id,
924 'title': video_title,
925 'thumbnail': video_thumbnail,
929 class BandcampIE(InfoExtractor):
930 _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'
932 def _real_extract(self, url):
933 mobj = re.match(self._VALID_URL, url)
934 title = mobj.group('title')
935 webpage = self._download_webpage(url, title)
936 # We get the link to the free download page
937 m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
938 if m_download is None:
939 raise ExtractorError(u'No free songs found')
941 download_link = m_download.group(1)
942 id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
943 webpage, re.MULTILINE|re.DOTALL).group('id')
945 download_webpage = self._download_webpage(download_link, id,
946 'Downloading free downloads page')
947 # We get the dictionary of the track from some JavaScript code
948 info = re.search(r'items: (.*?),$',
949 download_webpage, re.MULTILINE).group(1)
950 info = json.loads(info)[0]
951 # We pick mp3-320 for now, until format selection can be easily implemented.
952 mp3_info = info[u'downloads'][u'mp3-320']
953 # If we try to use this url it says the link has expired
954 initial_url = mp3_info[u'url']
955 re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
956 m_url = re.match(re_url, initial_url)
957 #We build the url we will use to get the final track url
958 # This URL is built in Bandcamp in the script download_bunde_*.js
959 request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), id, m_url.group('ts'))
960 final_url_webpage = self._download_webpage(request_url, id, 'Requesting download url')
961 # If we could correctly generate the .rand field the url would be
962 #in the "download_url" key
963 final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)
965 track_info = {'id':id,
966 'title' : info[u'title'],
969 'thumbnail' : info[u'thumb_url'],
970 'uploader' : info[u'artist']
975 class RedTubeIE(InfoExtractor):
976 """Information Extractor for redtube"""
977 _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'
979 def _real_extract(self,url):
980 mobj = re.match(self._VALID_URL, url)
982 raise ExtractorError(u'Invalid URL: %s' % url)
984 video_id = mobj.group('id')
985 video_extension = 'mp4'
986 webpage = self._download_webpage(url, video_id)
988 self.report_extraction(video_id)
990 video_url = self._html_search_regex(r'<source src="(.+?)" type="video/mp4">',
991 webpage, u'video URL')
993 video_title = self._html_search_regex('<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
999 'ext': video_extension,
1000 'title': video_title,
1003 class InaIE(InfoExtractor):
1004 """Information Extractor for Ina.fr"""
1005 _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'
1007 def _real_extract(self,url):
1008 mobj = re.match(self._VALID_URL, url)
1010 video_id = mobj.group('id')
1011 mrss_url='http://player.ina.fr/notices/%s.mrss' % video_id
1012 video_extension = 'mp4'
1013 webpage = self._download_webpage(mrss_url, video_id)
1015 self.report_extraction(video_id)
1017 video_url = self._html_search_regex(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
1018 webpage, u'video URL')
1020 video_title = self._search_regex(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
1026 'ext': video_extension,
1027 'title': video_title,
1030 class HowcastIE(InfoExtractor):
1031 """Information Extractor for Howcast.com"""
1032 _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'
1034 def _real_extract(self, url):
1035 mobj = re.match(self._VALID_URL, url)
1037 video_id = mobj.group('id')
1038 webpage_url = 'http://www.howcast.com/videos/' + video_id
1039 webpage = self._download_webpage(webpage_url, video_id)
1041 self.report_extraction(video_id)
1043 video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
1044 webpage, u'video URL')
1046 video_title = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
1049 video_description = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
1050 webpage, u'description', fatal=False)
1052 thumbnail = self._html_search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
1053 webpage, u'thumbnail', fatal=False)
1059 'title': video_title,
1060 'description': video_description,
1061 'thumbnail': thumbnail,
1064 class VineIE(InfoExtractor):
1065 """Information Extractor for Vine.co"""
1066 _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'
1068 def _real_extract(self, url):
1069 mobj = re.match(self._VALID_URL, url)
1071 video_id = mobj.group('id')
1072 webpage_url = 'https://vine.co/v/' + video_id
1073 webpage = self._download_webpage(webpage_url, video_id)
1075 self.report_extraction(video_id)
1077 video_url = self._html_search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
1078 webpage, u'video URL')
1080 video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
1083 thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"',
1084 webpage, u'thumbnail', fatal=False)
1086 uploader = self._html_search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
1087 webpage, u'uploader', fatal=False, flags=re.DOTALL)
1093 'title': video_title,
1094 'thumbnail': thumbnail,
1095 'uploader': uploader,
class FlickrIE(InfoExtractor):
    """Information Extractor for Flickr videos"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'

    def _real_extract(self, url):
        """Extract the stream URL and metadata for one Flickr video.

        Steps:
          1. download the photo page and read its ``photo_secret``;
          2. request ``video_mtl_xml.gne`` to obtain the playlist node id;
          3. request ``video_playlist.gne`` and join the ``APP`` and
             ``FULLPATH`` attributes of its ``<STREAM>`` element.

        Title, description and thumbnail are read from the photo page's
        Open Graph ``<meta>`` tags (description and thumbnail with
        ``fatal=False``).

        Returns a one-element list containing the info dictionary.
        Raises ExtractorError if ``url`` does not match ``_VALID_URL`` or
        if no ``<STREAM>`` element is found.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # Without this guard a non-matching URL surfaces as an
            # AttributeError on mobj.group() instead of a clear error.
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        video_uploader_id = mobj.group('uploader_id')
        webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')

        first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
        first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')

        node_id = self._html_search_regex(r'<Item id="id">(\d+-\d+)</Item>',
            first_xml, u'node_id')

        second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
        second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')

        self.report_extraction(video_id)

        mobj = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video url')
        # Only the FULLPATH attribute is passed through unescapeHTML.
        video_url = mobj.group(1) + unescapeHTML(mobj.group(2))

        video_title = self._html_search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'video title')

        video_description = self._html_search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'description', fatal=False)

        thumbnail = self._html_search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            # NOTE(review): container assumed to be mp4 - confirm.
            'ext': 'mp4',
            'title': video_title,
            'description': video_description,
            'thumbnail': thumbnail,
            'uploader_id': video_uploader_id,
        }]
class TeamcocoIE(InfoExtractor):
    """Information Extractor for teamcoco.com video pages."""
    _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'

    def _real_extract(self, url):
        """Extract the media URL and metadata for one Team Coco video.

        The page at ``url`` is downloaded as given; the numeric video id
        is read from its ``<article class="video" data-id="...">`` tag and
        used to fetch ``http://teamcoco.com/cvp/2.0/<id>.xml``, from which
        the ``<file type="high">`` entry is taken as the video URL.
        Title, thumbnail and description come from the page's Open Graph
        ``<meta>`` tags (thumbnail and description with ``fatal=False``).

        Returns a one-element list containing the info dictionary.
        Raises ExtractorError if ``url`` does not match ``_VALID_URL``.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        # The URL tail is only available label until the real id is known.
        url_title = mobj.group('url_title')
        webpage = self._download_webpage(url, url_title)

        video_id = self._html_search_regex(r'<article class="video" data-id="(\d+?)"',
            webpage, u'video id')

        self.report_extraction(video_id)

        video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
            webpage, u'title')

        thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)"',
            webpage, u'thumbnail', fatal=False)

        video_description = self._html_search_regex(r'<meta property="og:description" content="(.*?)"',
            webpage, u'description', fatal=False)

        data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
        data = self._download_webpage(data_url, video_id, 'Downloading data webpage')

        video_url = self._html_search_regex(r'<file type="high".*?>(.*?)</file>',
            data, u'video URL')

        return [{
            'id': video_id,
            'url': video_url,
            # NOTE(review): container assumed to be mp4 - confirm.
            'ext': 'mp4',
            'title': video_title,
            'thumbnail': thumbnail,
            'description': video_description,
        }]
class XHamsterIE(InfoExtractor):
    """Information Extractor for xHamster"""
    # The dot after "www" is escaped so that it only matches a literal ".".
    _VALID_URL = r'(?:http://)?(?:www\.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'

    def _real_extract(self,url):
        """Extract the media URL and metadata for one xHamster movie.

        The movie page is downloaded and its ``'srv'`` / ``'file'`` player
        settings are combined into the media URL.  The file extension is
        whatever follows the last "." of that URL.  Upload date, uploader
        id and thumbnail are optional.

        Returns a one-element list containing the info dictionary.
        Raises ExtractorError if ``url`` does not match ``_VALID_URL`` or
        if the player settings cannot be found.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # Without this guard a non-matching URL surfaces as an
            # AttributeError on mobj.group() instead of a clear error.
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
        webpage = self._download_webpage(mrss_url, video_id)

        mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract media URL')
        if not mobj.group('server'):
            # Empty server: the unquoted 'file' value is used as the URL.
            video_url = compat_urllib_parse.unquote(mobj.group('file'))
        else:
            video_url = mobj.group('server')+'/key='+mobj.group('file')
        video_extension = video_url.split('.')[-1]

        video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
            webpage, u'title')

        # No description is extracted: it cannot be seen anywhere in the UI.

        mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
        if mobj:
            # Concatenate to YYYYMMDD.
            video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
        else:
            video_upload_date = None
            self._downloader.report_warning(u'Unable to extract upload date')

        video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
            webpage, u'uploader id', default=u'anonymous')

        video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_extension,
            'title': video_title,
            'upload_date': video_upload_date,
            'uploader_id': video_uploader_id,
            'thumbnail': video_thumbnail
        }]
class HypemIE(InfoExtractor):
    """Information Extractor for hypem"""
    _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'

    def _real_extract(self, url):
        """Extract the audio URL and metadata for one Hype Machine track.

        The track page is requested with ``ax`` and ``ts`` query
        parameters; the JSON embedded in its ``displayList-data`` script
        tag supplies the id, artist, song title and key of the first
        track.  A second request to ``/serve/source/<id>/<key>``, sent
        with the ``Set-Cookie`` value of the first response, returns JSON
        whose ``url`` field is the final media URL.

        Returns a one-element list containing the info dictionary.
        Raises ExtractorError if ``url`` does not match ``_VALID_URL`` or
        if either JSON document cannot be used.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        # Provisional id from the URL; replaced by the JSON id below.
        track_id = mobj.group(1)

        data = { 'ax': 1, 'ts': time.time() }
        data_encoded = compat_urllib_parse.urlencode(data)
        complete_url = url + "?" + data_encoded
        request = compat_urllib_request.Request(complete_url)
        response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
        # Kept so it can be replayed on the serve request further down.
        cookie = urlh.headers.get('Set-Cookie', '')

        self.report_extraction(track_id)

        html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
            response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
        try:
            track_list = json.loads(html_tracks)
            track = track_list[u'tracks'][0]
        except (ValueError, KeyError, IndexError):
            # Also covers well-formed JSON that has no usable track entry.
            raise ExtractorError(u'Hypemachine contained invalid JSON.')

        # NOTE(review): "key" is consumed by serve_url below; reading it
        # from the track entry under u"key" is an assumption - confirm.
        key = track[u"key"]
        track_id = track[u"id"]
        artist = track[u"artist"]
        title = track[u"song"]

        serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
        request = compat_urllib_request.Request(serve_url, "" , {'Content-Type': 'application/json'})
        request.add_header('cookie', cookie)
        song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
        try:
            song_data = json.loads(song_data_json)
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')
        final_url = song_data[u"url"]

        return [{
            'id': track_id,
            'url': final_url,
            # NOTE(review): container assumed to be mp3 - confirm.
            'ext': "mp3",
            'title': title,
            'artist': artist,
        }]
class Vbox7IE(InfoExtractor):
    """Information Extractor for Vbox7"""
    _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'

    def _real_extract(self,url):
        """Extract the media URL and metadata for one Vbox7 video.

        The first response only contains a ``window.location`` redirect;
        its target is appended to the fetched URL and downloaded to get
        the page title.  The media and thumbnail URLs come from a POST to
        ``/play/magare.do``, whose body is read as two ``name=value``
        fields joined by "&".

        Returns a one-element list containing the info dictionary.
        Raises ExtractorError if ``url`` does not match ``_VALID_URL`` or
        if the info response cannot be split into exactly two fields.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        redirect_page, urlh = self._download_webpage_handle(url, video_id)
        new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
        redirect_url = urlh.geturl() + new_location
        webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')

        # Only the part of <title> before the first "/" is kept.
        title = self._html_search_regex(r'<title>(.*)</title>',
            webpage, u'title').split('/')[0].strip()

        # NOTE(review): container assumed to be flv - confirm.
        ext = "flv"
        info_url = "http://vbox7.com/play/magare.do"
        data = compat_urllib_parse.urlencode({'as3':'1','vid':video_id})
        info_request = compat_urllib_request.Request(info_url, data)
        info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
        info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
        if info_response is None:
            raise ExtractorError(u'Unable to extract the media url')
        try:
            # Split on the first "=" only, so values that themselves
            # contain "=" (e.g. URLs with a query string) stay intact.
            (final_url, thumbnail_url) = [
                field.split('=', 1)[1] for field in info_response.split('&')]
        except (IndexError, ValueError):
            # A field without "=" or a field count other than two.
            raise ExtractorError(u'Unable to extract the media url')

        return [{
            'id': video_id,
            'url': final_url,
            'ext': ext,
            'title': title,
            'thumbnail': thumbnail_url,
        }]
# Builds the ordered list of extractor instances. As the docstring below
# states, order matters: the first extractor that matches handles the URL.
# NOTE(review): only three entries of the list (YoutubePlaylistIE,
# StanfordOpenClassroomIE, WorldStarHipHopIE) are visible in this excerpt;
# the remaining entries, the end of the docstring and the list delimiters
# are not shown, so the complete ordering cannot be documented from here.
1325 def gen_extractors():
1326 """ Return a list of an instance of every supported extractor.
1327 The order does matter; the first extractor matched is the one handling the URL.
1330 YoutubePlaylistIE(),
1355 StanfordOpenClassroomIE(),
1365 WorldStarHipHopIE(),
def get_info_extractor(ie_name):
    """Look up an extractor class by its short name.

    ``ie_name`` is the class name without the trailing ``IE``; the class
    is fetched from this module's global namespace, so an unknown name
    raises KeyError.
    """
    class_name = ie_name + 'IE'
    module_namespace = globals()
    return module_namespace[class_name]