10 import xml.etree.ElementTree
19 from .extractor.common import InfoExtractor, SearchInfoExtractor
21 from .extractor.ard import ARDIE
22 from .extractor.arte import ArteTvIE
23 from .extractor.bliptv import BlipTVIE, BlipTVUserIE
24 from .extractor.comedycentral import ComedyCentralIE
25 from .extractor.collegehumor import CollegeHumorIE
26 from .extractor.dailymotion import DailymotionIE
27 from .extractor.depositfiles import DepositFilesIE
28 from .extractor.eighttracks import EightTracksIE
29 from .extractor.escapist import EscapistIE
30 from .extractor.facebook import FacebookIE
31 from .extractor.funnyordie import FunnyOrDieIE
32 from .extractor.gametrailers import GametrailersIE
33 from .extractor.generic import GenericIE
34 from .extractor.googleplus import GooglePlusIE
35 from .extractor.googlesearch import GoogleSearchIE
36 from .extractor.infoq import InfoQIE
37 from .extractor.justintv import JustinTVIE
38 from .extractor.metacafe import MetacafeIE
39 from .extractor.mixcloud import MixcloudIE
40 from .extractor.mtv import MTVIE
41 from .extractor.myvideo import MyVideoIE
42 from .extractor.nba import NBAIE
43 from .extractor.statigram import StatigramIE
44 from .extractor.photobucket import PhotobucketIE
45 from .extractor.pornotube import PornotubeIE
46 from .extractor.rbmaradio import RBMARadioIE
47 from .extractor.soundcloud import SoundcloudIE, SoundcloudSetIE
48 from .extractor.stanfordoc import StanfordOpenClassroomIE
49 from .extractor.steam import SteamIE
50 from .extractor.ted import TEDIE
51 from .extractor.ustream import UstreamIE
52 from .extractor.vimeo import VimeoIE
53 from .extractor.worldstarhiphop import WorldStarHipHopIE
54 from .extractor.xnxx import XNXXIE
55 from .extractor.xvideos import XVideosIE
56 from .extractor.yahoo import YahooIE, YahooSearchIE
57 from .extractor.youjizz import YouJizzIE
58 from .extractor.youku import YoukuIE
59 from .extractor.youporn import YouPornIE
60 from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, YoutubeUserIE, YoutubeChannelIE
61 from .extractor.zdf import ZDFIE
# Extractor for keek.com video pages. The video id is captured from the URL
# and used to build the CDN media and thumbnail URLs; the title and uploader
# are scraped from the downloaded page.
88 class KeekIE(InfoExtractor):
# Accepts both the short form (keek.com/!<id>) and <user>/keeks/<id>.
89 _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
92 def _real_extract(self, url):
93 m = re.match(self._VALID_URL, url)
94 video_id = m.group('videoID')
# Media and thumbnail URLs are derived from the id alone.
96 video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
97 thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
98 webpage = self._download_webpage(url, video_id)
100 video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
103 uploader = self._html_search_regex(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
104 webpage, u'uploader', fatal=False)
110 'title': video_title,
111 'thumbnail': thumbnail,
# Extractor for myspass.de. The video id is taken from the URL path and used
# to download an XML metadata document, from which the media URL, title,
# format id, description and preview image are read.
117 class MySpassIE(InfoExtractor):
# NOTE(review): the dots in this pattern are unescaped, so they match any
# character — confirm whether that is intentional.
118 _VALID_URL = r'http://www.myspass.de/.*'
120 def _real_extract(self, url):
121 META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'
123 # The video id is the last path element of the URL.
124 # Usually there is a trailing slash, so also try the second-to-last element.
125 url_path = compat_urllib_parse_urlparse(url).path
126 url_parent_path, video_id = os.path.split(url_path)
128 _, video_id = os.path.split(url_parent_path)
131 metadata_url = META_DATA_URL_TEMPLATE % video_id
132 metadata_text = self._download_webpage(metadata_url, video_id)
# The downloaded text is re-encoded to UTF-8 bytes before XML parsing.
133 metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))
135 # extract values from metadata
136 url_flv_el = metadata.find('url_flv')
# A missing <url_flv> element is fatal: there is no media URL without it.
137 if url_flv_el is None:
138 raise ExtractorError(u'Unable to extract download url')
139 video_url = url_flv_el.text
# File extension of the media URL, without the leading dot.
140 extension = os.path.splitext(video_url)[1][1:]
141 title_el = metadata.find('title')
143 raise ExtractorError(u'Unable to extract title')
144 title = title_el.text
145 format_id_el = metadata.find('format_id')
146 if format_id_el is None:
149 format = format_id_el.text
# <description> and <imagePreview> are only read when present.
150 description_el = metadata.find('description')
151 if description_el is not None:
152 description = description_el.text
155 imagePreview_el = metadata.find('imagePreview')
156 if imagePreview_el is not None:
157 thumbnail = imagePreview_el.text
166 'thumbnail': thumbnail,
167 'description': description
# Extractor for spiegel.de video pages. The numeric id at the end of the URL
# selects an XML descriptor on video2.spiegel.de, which supplies the media
# file name and duration; the title is scraped from the HTML page.
171 class SpiegelIE(InfoExtractor):
172 _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'
174 def _real_extract(self, url):
175 m = re.match(self._VALID_URL, url)
176 video_id = m.group('videoID')
178 webpage = self._download_webpage(url, video_id)
180 video_title = self._html_search_regex(r'<div class="module-title">(.*?)</div>',
# The per-video XML descriptor lives next to the media files.
183 xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
184 xml_code = self._download_webpage(xml_url, video_id,
185 note=u'Downloading XML', errnote=u'Failed to download XML')
187 idoc = xml.etree.ElementTree.fromstring(xml_code)
# File name and duration are read from the first <filename>/<duration>
# children of the selected element; the duration is converted to a float.
189 filename = last_type.findall('./filename')[0].text
190 duration = float(last_type.findall('./duration')[0].text)
192 video_url = 'http://video2.spiegel.de/flash/' + filename
# Extension is whatever follows the last '.' in the file name.
193 video_ext = filename.rpartition('.')[2]
198 'title': video_title,
199 'duration': duration,
# Extractor for liveleak.com "view" pages. The video id comes from the `i`
# query parameter; media URL, title, description and uploader are scraped
# from the downloaded page.
203 class LiveLeakIE(InfoExtractor):
# NOTE(review): `http?://` only makes the 'p' optional, so an `https://`
# prefix is not matched by this pattern — possibly `https?://` was intended.
205 _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
206 IE_NAME = u'liveleak'
208 def _real_extract(self, url):
209 mobj = re.match(self._VALID_URL, url)
211 raise ExtractorError(u'Invalid URL: %s' % url)
213 video_id = mobj.group('video_id')
215 webpage = self._download_webpage(url, video_id)
# The media URL is taken from a `file: "..."` entry in the page source.
217 video_url = self._search_regex(r'file: "(.*?)",',
218 webpage, u'video URL')
# The site prefix 'LiveLeak.com -' is removed from the og:title value.
220 video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
221 webpage, u'title').replace('LiveLeak.com -', '').strip()
# Description and uploader are looked up with fatal=False.
223 video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
224 webpage, u'description', fatal=False)
226 video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
227 webpage, u'uploader', fatal=False)
233 'title': video_title,
234 'description': video_description,
235 'uploader': video_uploader
# Extractor for tumblr.com post/video pages. The blog name and post id are
# captured from the URL, the canonical post page is downloaded, and the
# media URL, extension, thumbnail and title are scraped from it.
242 class TumblrIE(InfoExtractor):
243 _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'
245 def _real_extract(self, url):
246 m_url = re.match(self._VALID_URL, url)
247 video_id = m_url.group('id')
248 blog = m_url.group('blog_name')
# Always fetch the /post/ form of the URL, whichever form was given.
250 url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
251 webpage = self._download_webpage(url, video_id)
# The pattern looks for attributes quoted with a literal `\x22` sequence
# (an escaped double quote) in the page source.
253 re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
254 video = re.search(re_video, webpage)
256 raise ExtractorError(u'Unable to extract video')
257 video_url = video.group('video_url')
258 ext = video.group('ext')
260 video_thumbnail = self._search_regex(r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
261 webpage, u'thumbnail', fatal=False) # We pick the first poster
# Backslashes are stripped from the thumbnail URL when one was found.
262 if video_thumbnail: video_thumbnail = video_thumbnail.replace('\\', '')
264 # The only place where you can get a title, it's not complete,
265 # but searching in other places doesn't work for all videos
266 video_title = self._html_search_regex(r'<title>(?P<title>.*?)</title>',
267 webpage, u'title', flags=re.DOTALL)
269 return [{'id': video_id,
271 'title': video_title,
272 'thumbnail': video_thumbnail,
# Extractor for bandcamp.com track pages that offer a free download. It
# follows the free-download page, reads the track dictionary embedded there,
# and requests a "statdownload" URL to obtain the final mp3-320 link.
276 class BandcampIE(InfoExtractor):
277 _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'
279 def _real_extract(self, url):
280 mobj = re.match(self._VALID_URL, url)
281 title = mobj.group('title')
282 webpage = self._download_webpage(url, title)
283 # We get the link to the free download page
284 m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
285 if m_download is None:
286 raise ExtractorError(u'No free songs found')
288 download_link = m_download.group(1)
# The numeric track id is read from the TralbumData object in the page.
289 id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
290 webpage, re.MULTILINE|re.DOTALL).group('id')
292 download_webpage = self._download_webpage(download_link, id,
293 'Downloading free downloads page')
294 # We get the dictionary of the track from some JavaScript code
295 info = re.search(r'items: (.*?),$',
296 download_webpage, re.MULTILINE).group(1)
297 info = json.loads(info)[0]
298 # We pick mp3-320 for now, until format selection can be easily implemented.
299 mp3_info = info[u'downloads'][u'mp3-320']
300 # If we try to use this url it says the link has expired
301 initial_url = mp3_info[u'url']
302 re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
303 m_url = re.match(re_url, initial_url)
304 # We build the URL we will use to get the final track URL
305 # This URL is built by Bandcamp in the script download_bunde_*.js
306 request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), id, m_url.group('ts'))
307 final_url_webpage = self._download_webpage(request_url, id, 'Requesting download url')
308 # If we could correctly generate the .rand field the URL would be
309 # in the "download_url" key
310 final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)
312 track_info = {'id':id,
313 'title' : info[u'title'],
316 'thumbnail' : info[u'thumb_url'],
317 'uploader' : info[u'artist']
# Scrapes the media URL from the page's mp4 <source> tag and the title from
# the videoTitle <h1>; the extension is fixed to 'mp4'.
322 class RedTubeIE(InfoExtractor):
323 """Information Extractor for redtube"""
324 _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'
326 def _real_extract(self,url):
327 mobj = re.match(self._VALID_URL, url)
329 raise ExtractorError(u'Invalid URL: %s' % url)
331 video_id = mobj.group('id')
332 video_extension = 'mp4'
333 webpage = self._download_webpage(url, video_id)
335 self.report_extraction(video_id)
337 video_url = self._html_search_regex(r'<source src="(.+?)" type="video/mp4">',
338 webpage, u'video URL')
340 video_title = self._html_search_regex('<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
346 'ext': video_extension,
347 'title': video_title,
# Instead of the HTML page, this extractor downloads the MRSS notice for the
# video id from player.ina.fr and reads the mp4 URL and title from it.
350 class InaIE(InfoExtractor):
351 """Information Extractor for Ina.fr"""
352 _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'
354 def _real_extract(self,url):
355 mobj = re.match(self._VALID_URL, url)
357 video_id = mobj.group('id')
358 mrss_url='http://player.ina.fr/notices/%s.mrss' % video_id
359 video_extension = 'mp4'
360 webpage = self._download_webpage(mrss_url, video_id)
362 self.report_extraction(video_id)
364 video_url = self._html_search_regex(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
365 webpage, u'video URL')
# The title is wrapped in a CDATA section inside <title>.
367 video_title = self._search_regex(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
373 'ext': video_extension,
374 'title': video_title,
# The page is re-requested at a normalized http://www.howcast.com/videos/<id>
# URL; media URL, title, description and thumbnail are scraped from it.
377 class HowcastIE(InfoExtractor):
378 """Information Extractor for Howcast.com"""
379 _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'
381 def _real_extract(self, url):
382 mobj = re.match(self._VALID_URL, url)
384 video_id = mobj.group('id')
385 webpage_url = 'http://www.howcast.com/videos/' + video_id
386 webpage = self._download_webpage(webpage_url, video_id)
388 self.report_extraction(video_id)
# Only mobile-media.howcast.com mp4 URLs are accepted by this pattern.
390 video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
391 webpage, u'video URL')
# The content attribute may be quoted with either double or single quotes.
393 video_title = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
396 video_description = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
397 webpage, u'description', fatal=False)
399 thumbnail = self._html_search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
400 webpage, u'thumbnail', fatal=False)
406 'title': video_title,
407 'description': video_description,
408 'thumbnail': thumbnail,
# The page is fetched from a normalized https://vine.co/v/<id> URL; the
# stream URL, title, thumbnail and uploader are read from its markup.
411 class VineIE(InfoExtractor):
412 """Information Extractor for Vine.co"""
413 _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'
415 def _real_extract(self, url):
416 mobj = re.match(self._VALID_URL, url)
418 video_id = mobj.group('id')
419 webpage_url = 'https://vine.co/v/' + video_id
420 webpage = self._download_webpage(webpage_url, video_id)
422 self.report_extraction(video_id)
424 video_url = self._html_search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
425 webpage, u'video URL')
427 video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
# The optional second group lets the pattern consume a query string
# following the image URL.
430 thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"',
431 webpage, u'thumbnail', fatal=False)
# re.DOTALL lets `.*?` span line breaks between the user <div> and its <h2>.
433 uploader = self._html_search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
434 webpage, u'uploader', fatal=False, flags=re.DOTALL)
440 'title': video_title,
441 'thumbnail': thumbnail,
442 'uploader': uploader,
# Resolution takes three requests: the photo page (for the photo secret),
# a first XML document (for the node id), and a playlist XML document (for
# the stream APP and FULLPATH that make up the media URL).
445 class FlickrIE(InfoExtractor):
446 """Information Extractor for Flickr videos"""
447 _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'
449 def _real_extract(self, url):
450 mobj = re.match(self._VALID_URL, url)
452 video_id = mobj.group('id')
453 video_uploader_id = mobj.group('uploader_id')
454 webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
455 webpage = self._download_webpage(webpage_url, video_id)
# The secret is passed as a query parameter in both follow-up requests.
457 secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')
459 first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
460 first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')
462 node_id = self._html_search_regex(r'<Item id="id">(\d+-\d+)</Item>',
463 first_xml, u'node_id')
465 second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
466 second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')
468 self.report_extraction(video_id)
470 mobj = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
472 raise ExtractorError(u'Unable to extract video url')
# The media URL is APP followed by the HTML-unescaped FULLPATH.
473 video_url = mobj.group(1) + unescapeHTML(mobj.group(2))
# Title, description and thumbnail come from the og: meta tags of the
# original photo page, not from the XML documents.
475 video_title = self._html_search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
476 webpage, u'video title')
478 video_description = self._html_search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
479 webpage, u'description', fatal=False)
481 thumbnail = self._html_search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
482 webpage, u'thumbnail', fatal=False)
488 'title': video_title,
489 'description': video_description,
490 'thumbnail': thumbnail,
491 'uploader_id': video_uploader_id,
# Extractor for teamcoco.com video pages. The numeric video id is scraped
# from the page (the URL only carries a title slug) and then used to fetch
# an XML data document containing the media file URL.
494 class TeamcocoIE(InfoExtractor):
495 _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'
497 def _real_extract(self, url):
498 mobj = re.match(self._VALID_URL, url)
500 raise ExtractorError(u'Invalid URL: %s' % url)
# The URL slug is only used as the identifier for the first download.
501 url_title = mobj.group('url_title')
502 webpage = self._download_webpage(url, url_title)
504 video_id = self._html_search_regex(r'<article class="video" data-id="(\d+?)"',
505 webpage, u'video id')
507 self.report_extraction(video_id)
509 video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
512 thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)"',
513 webpage, u'thumbnail', fatal=False)
515 video_description = self._html_search_regex(r'<meta property="og:description" content="(.*?)"',
516 webpage, u'description', fatal=False)
518 data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
519 data = self._download_webpage(data_url, video_id, 'Downloading data webpage')
# The pattern selects the <file> element whose type is "high".
521 video_url = self._html_search_regex(r'<file type="high".*?>(.*?)</file>',
528 'title': video_title,
529 'thumbnail': thumbnail,
530 'description': video_description,
# The page is re-requested at http://xhamster.com/movies/<id>/.html; the
# media URL is assembled from the 'srv' and 'file' values found in the page,
# and title, upload date, uploader id and thumbnail are scraped from it.
533 class XHamsterIE(InfoExtractor):
534 """Information Extractor for xHamster"""
# NOTE(review): the dot in `(?:www.)?` is unescaped and matches any
# character — confirm whether `www\.` was intended.
535 _VALID_URL = r'(?:http://)?(?:www.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'
537 def _real_extract(self,url):
538 mobj = re.match(self._VALID_URL, url)
540 video_id = mobj.group('id')
541 mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
542 webpage = self._download_webpage(mrss_url, video_id)
544 mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
546 raise ExtractorError(u'Unable to extract media URL')
# With an empty server value, 'file' is itself a URL-quoted media URL;
# the other form joins server and file with '/key='.
547 if len(mobj.group('server')) == 0:
548 video_url = compat_urllib_parse.unquote(mobj.group('file'))
550 video_url = mobj.group('server')+'/key='+mobj.group('file')
# Extension is whatever follows the last '.' in the media URL.
551 video_extension = video_url.split('.')[-1]
553 video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
556 # Can't see the description anywhere in the UI
557 # video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
558 # webpage, u'description', fatal=False)
559 # if video_description: video_description = unescapeHTML(video_description)
# The upload date is read from a hint='YYYY-MM-DD HH:MM:SS TZ' attribute
# and concatenated into YYYYMMDD.
561 mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
563 video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
565 video_upload_date = None
566 self._downloader.report_warning(u'Unable to extract upload date')
# The uploader lookup is given the default u'anonymous'.
568 video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
569 webpage, u'uploader id', default=u'anonymous')
571 video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
572 webpage, u'thumbnail', fatal=False)
577 'ext': video_extension,
578 'title': video_title,
579 # 'description': video_description,
580 'upload_date': video_upload_date,
581 'uploader_id': video_uploader_id,
582 'thumbnail': video_thumbnail
# The track page is requested with extra `ax` and `ts` query parameters, the
# embedded JSON track list is parsed, and a second request to
# hypem.com/serve/source/<id>/<key> returns JSON holding the final URL.
585 class HypemIE(InfoExtractor):
586 """Information Extractor for hypem"""
587 _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'
589 def _real_extract(self, url):
590 mobj = re.match(self._VALID_URL, url)
592 raise ExtractorError(u'Invalid URL: %s' % url)
593 track_id = mobj.group(1)
# `ts` carries the current time, so the request URL differs on every call.
595 data = { 'ax': 1, 'ts': time.time() }
596 data_encoded = compat_urllib_parse.urlencode(data)
597 complete_url = url + "?" + data_encoded
598 request = compat_urllib_request.Request(complete_url)
599 response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
# The Set-Cookie value from this response is sent back on the serve request.
600 cookie = urlh.headers.get('Set-Cookie', '')
602 self.report_extraction(track_id)
604 html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
605 response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
# Only the first entry of the 'tracks' list is used.
607 track_list = json.loads(html_tracks)
608 track = track_list[u'tracks'][0]
610 raise ExtractorError(u'Hypemachine contained invalid JSON.')
# The id from the JSON replaces the id taken from the URL.
613 track_id = track[u"id"]
614 artist = track[u"artist"]
615 title = track[u"song"]
617 serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
618 request = compat_urllib_request.Request(serve_url, "" , {'Content-Type': 'application/json'})
619 request.add_header('cookie', cookie)
620 song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
622 song_data = json.loads(song_data_json)
624 raise ExtractorError(u'Hypemachine contained invalid JSON.')
625 final_url = song_data[u"url"]
# The first response carries a JavaScript redirect that is followed by hand
# to reach the real page (for the title); the media and thumbnail URLs come
# from a separate form-encoded POST to /play/magare.do.
635 class Vbox7IE(InfoExtractor):
636 """Information Extractor for Vbox7"""
637 _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'
639 def _real_extract(self,url):
640 mobj = re.match(self._VALID_URL, url)
642 raise ExtractorError(u'Invalid URL: %s' % url)
643 video_id = mobj.group(1)
# The redirect target is appended to the URL the first request ended on.
645 redirect_page, urlh = self._download_webpage_handle(url, video_id)
646 new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
647 redirect_url = urlh.geturl() + new_location
648 webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')
# Only the part of <title> before the first '/' is kept.
650 title = self._html_search_regex(r'<title>(.*)</title>',
651 webpage, u'title').split('/')[0].strip()
654 info_url = "http://vbox7.com/play/magare.do"
655 data = compat_urllib_parse.urlencode({'as3':'1','vid':video_id})
656 info_request = compat_urllib_request.Request(info_url, data)
657 info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
658 info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
659 if info_response is None:
660 raise ExtractorError(u'Unable to extract the media url')
# The response is split on '&' and each field on '='; the tuple unpacking
# requires exactly two fields, taken as media URL then thumbnail URL.
661 (final_url, thumbnail_url) = map(lambda x: x.split('=')[1], info_response.split('&'))
668 'thumbnail': thumbnail_url,
# Factory for the ordered list of extractor instances; per the docstring, the
# first extractor that matches a URL handles it, so list order is significant.
672 def gen_extractors():
673 """ Return a list of an instance of every supported extractor.
674 The order does matter; the first extractor matched is the one handling the URL.
702 StanfordOpenClassroomIE(),
def get_info_extractor(ie_name):
    """Look up an info extractor class by its short name.

    The suffix 'IE' is appended to ``ie_name`` and the resulting identifier
    is resolved in this module's global namespace.

    Raises:
        KeyError: if no global with that name exists in this module.
    """
    class_name = '%s%s' % (ie_name, 'IE')
    module_namespace = globals()
    return module_namespace[class_name]