10 import xml.etree.ElementTree
19 from .extractor.common import InfoExtractor, SearchInfoExtractor
21 from .extractor.ard import ARDIE
22 from .extractor.arte import ArteTvIE
23 from .extractor.bliptv import BlipTVIE, BlipTVUserIE
24 from .extractor.comedycentral import ComedyCentralIE
25 from .extractor.collegehumor import CollegeHumorIE
26 from .extractor.dailymotion import DailymotionIE
27 from .extractor.depositfiles import DepositFilesIE
28 from .extractor.eighttracks import EightTracksIE
29 from .extractor.escapist import EscapistIE
30 from .extractor.facebook import FacebookIE
31 from .extractor.funnyordie import FunnyOrDieIE
32 from .extractor.gametrailers import GametrailersIE
33 from .extractor.generic import GenericIE
34 from .extractor.googleplus import GooglePlusIE
35 from .extractor.googlesearch import GoogleSearchIE
36 from .extractor.infoq import InfoQIE
37 from .extractor.justintv import JustinTVIE
38 from .extractor.keek import KeekIE
39 from .extractor.metacafe import MetacafeIE
40 from .extractor.mixcloud import MixcloudIE
41 from .extractor.mtv import MTVIE
42 from .extractor.myspass import MySpassIE
43 from .extractor.myvideo import MyVideoIE
44 from .extractor.nba import NBAIE
45 from .extractor.statigram import StatigramIE
46 from .extractor.photobucket import PhotobucketIE
47 from .extractor.pornotube import PornotubeIE
48 from .extractor.rbmaradio import RBMARadioIE
49 from .extractor.soundcloud import SoundcloudIE, SoundcloudSetIE
50 from .extractor.stanfordoc import StanfordOpenClassroomIE
51 from .extractor.steam import SteamIE
52 from .extractor.ted import TEDIE
53 from .extractor.ustream import UstreamIE
54 from .extractor.vimeo import VimeoIE
55 from .extractor.worldstarhiphop import WorldStarHipHopIE
56 from .extractor.xnxx import XNXXIE
57 from .extractor.xvideos import XVideosIE
58 from .extractor.yahoo import YahooIE, YahooSearchIE
59 from .extractor.youjizz import YouJizzIE
60 from .extractor.youku import YoukuIE
61 from .extractor.youporn import YouPornIE
62 from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, YoutubeUserIE, YoutubeChannelIE
63 from .extractor.zdf import ZDFIE
# SpiegelIE: extractor for spiegel.de video pages. The numeric video id is the
# trailing "-<digits>" part of the URL path (named group "videoID").
93 class SpiegelIE(InfoExtractor):
94 _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'
# _real_extract(url): downloads the page (for the title) and then a per-video
# XML descriptor, from which the media filename and duration are read.
96 def _real_extract(self, url):
# NOTE(review): m is used without a None check; presumably the URL was already
# matched against _VALID_URL before this method is called -- confirm.
97 m = re.match(self._VALID_URL, url)
98 video_id = m.group('videoID')
100 webpage = self._download_webpage(url, video_id)
102 video_title = self._html_search_regex(r'<div class="module-title">(.*?)</div>',
# The XML descriptor URL is built from a fixed prefix plus the video id.
105 xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
106 xml_code = self._download_webpage(xml_url, video_id,
107 note=u'Downloading XML', errnote=u'Failed to download XML')
109 idoc = xml.etree.ElementTree.fromstring(xml_code)
# NOTE(review): last_type is not assigned in the lines visible here -- confirm
# where it is bound (presumably to one child element of idoc).
# findall(...)[0] raises IndexError if the element is absent.
111 filename = last_type.findall('./filename')[0].text
112 duration = float(last_type.findall('./duration')[0].text)
114 video_url = 'http://video2.spiegel.de/flash/' + filename
# Extension is the text after the last '.' of the filename (the whole
# filename if it contains no '.').
115 video_ext = filename.rpartition('.')[2]
# Key/value entries of a dict literal whose opening line is not visible here;
# presumably the info dict handed back to the caller.
120 'title': video_title,
121 'duration': duration,
# LiveLeakIE: extractor for liveleak.com "view?i=<id>" pages.
125 class LiveLeakIE(InfoExtractor):
# NOTE(review): "http?://" makes only the final "p" optional, so the scheme
# part accepts "htt://" and "http://" but never "https://"; "https?://" was
# probably intended.
127 _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
128 IE_NAME = u'liveleak'
# _real_extract(url): downloads the page and scrapes the media URL, title,
# description and uploader from it.
130 def _real_extract(self, url):
131 mobj = re.match(self._VALID_URL, url)
# The condition guarding this raise is not visible in this excerpt.
133 raise ExtractorError(u'Invalid URL: %s' % url)
135 video_id = mobj.group('video_id')
137 webpage = self._download_webpage(url, video_id)
# The media URL is taken from a 'file: "..."' entry in the page source.
139 video_url = self._search_regex(r'file: "(.*?)",',
140 webpage, u'video URL')
# The og:title value has the 'LiveLeak.com -' text removed and is stripped of
# surrounding whitespace.
142 video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
143 webpage, u'title').replace('LiveLeak.com -', '').strip()
# Description and uploader are requested with fatal=False; presumably a failed
# match is then tolerated instead of aborting -- confirm against the helper.
145 video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
146 webpage, u'description', fatal=False)
148 video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
149 webpage, u'uploader', fatal=False)
# Entries of a dict literal whose opening line is not visible in this excerpt.
155 'title': video_title,
156 'description': video_description,
157 'uploader': video_uploader
# TumblrIE: extractor for <blog>.tumblr.com post/video URLs. The pattern only
# accepts the plain "http://" scheme and captures the blog name and post id.
164 class TumblrIE(InfoExtractor):
165 _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'
# _real_extract(url): rebuilds the post URL, downloads it and searches the page
# for the video file URL, a poster thumbnail and the title.
167 def _real_extract(self, url):
168 m_url = re.match(self._VALID_URL, url)
169 video_id = m_url.group('id')
170 blog = m_url.group('blog_name')
# The incoming URL is replaced by its "/post/<id>/" form before downloading.
172 url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
173 webpage = self._download_webpage(url, video_id)
# The pattern matches quotes written as the literal characters \x22.
# NOTE(review): blog and video_id are interpolated into the regex without
# re.escape, so regex metacharacters in the blog name would be interpreted.
175 re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
176 video = re.search(re_video, webpage)
# The condition guarding this raise is not visible in this excerpt.
178 raise ExtractorError(u'Unable to extract video')
179 video_url = video.group('video_url')
180 ext = video.group('ext')
182 video_thumbnail = self._search_regex(r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
183 webpage, u'thumbnail', fatal=False) # We pick the first poster
# Backslashes are removed from the thumbnail value when one was found.
184 if video_thumbnail: video_thumbnail = video_thumbnail.replace('\\', '')
186 # The only place where you can get a title, it's not complete,
187 # but searching in other places doesn't work for all videos
188 video_title = self._html_search_regex(r'<title>(?P<title>.*?)</title>',
189 webpage, u'title', flags=re.DOTALL)
# A one-element list holding the info dict is returned; the remaining entries
# of the dict are not visible in this excerpt.
191 return [{'id': video_id,
193 'title': video_title,
194 'thumbnail': video_thumbnail,
# BandcampIE: extractor for <artist>.bandcamp.com/track/<title> pages. Only
# tracks exposing a free download page are handled; otherwise
# ExtractorError('No free songs found') is raised.
198 class BandcampIE(InfoExtractor):
199 _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'
# _real_extract(url): track page -> free download page -> "statdownload"
# request -> final media URL.
201 def _real_extract(self, url):
202 mobj = re.match(self._VALID_URL, url)
203 title = mobj.group('title')
204 webpage = self._download_webpage(url, title)
205 # We get the link to the free download page
206 m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
207 if m_download is None:
208 raise ExtractorError(u'No free songs found')
210 download_link = m_download.group(1)
# NOTE(review): the local name "id" shadows the builtin, and .group() is called
# on an unchecked re.search result (AttributeError if the pattern is absent).
211 id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
212 webpage, re.MULTILINE|re.DOTALL).group('id')
214 download_webpage = self._download_webpage(download_link, id,
215 'Downloading free downloads page')
216 # We get the dictionary of the track from some JavaScript code
# NOTE(review): unchecked re.search result here as well.
217 info = re.search(r'items: (.*?),$',
218 download_webpage, re.MULTILINE).group(1)
# Only the first element of the decoded "items" JSON value is used.
219 info = json.loads(info)[0]
220 # We pick mp3-320 for now, until format selection can be easily implemented.
221 mp3_info = info[u'downloads'][u'mp3-320']
222 # If we try to use this url it says the link has expired
223 initial_url = mp3_info[u'url']
224 re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
# NOTE(review): m_url is used below without a None check.
225 m_url = re.match(re_url, initial_url)
226 # We build the url we will use to get the final track url
227 # This url is built in Bandcamp in the script download_bunde_*.js
# The ".rand" value is a hard-coded constant; the id comes from the page
# (variable "id" above), not from the "id" group of re_url.
228 request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), id, m_url.group('ts'))
229 final_url_webpage = self._download_webpage(request_url, id, 'Requesting download url')
230 # If we could correctly generate the .rand field the url would be
231 # in the "download_url" key
# NOTE(review): unchecked re.search result.
232 final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)
# The remaining entries of track_info are not visible in this excerpt.
234 track_info = {'id':id,
235 'title' : info[u'title'],
238 'thumbnail' : info[u'thumb_url'],
239 'uploader' : info[u'artist']
# RedTubeIE: the video id is the run of digits following "redtube.com/".
244 class RedTubeIE(InfoExtractor):
245 """Information Extractor for redtube"""
246 _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'
# _real_extract(url): downloads the page and scrapes the mp4 source URL and
# the title from it.
248 def _real_extract(self,url):
249 mobj = re.match(self._VALID_URL, url)
# The condition guarding this raise is not visible in this excerpt.
251 raise ExtractorError(u'Invalid URL: %s' % url)
253 video_id = mobj.group('id')
# The extension is fixed to 'mp4'; the URL regex below only looks for a
# <source> tag with type="video/mp4".
254 video_extension = 'mp4'
255 webpage = self._download_webpage(url, video_id)
257 self.report_extraction(video_id)
259 video_url = self._html_search_regex(r'<source src="(.+?)" type="video/mp4">',
260 webpage, u'video URL')
# This pattern is a non-raw string; it contains no backslash escapes, so the
# missing r-prefix does not change its meaning.
262 video_title = self._html_search_regex('<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
# Entries of a dict literal whose opening line is not visible in this excerpt.
268 'ext': video_extension,
269 'title': video_title,
# InaIE: the video id is the "I<digits>" path segment after "ina.fr/video/".
272 class InaIE(InfoExtractor):
273 """Information Extractor for Ina.fr"""
274 _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'
# _real_extract(url): downloads the MRSS document for the id (not the page
# itself) and scrapes the mp4 URL and the title from it.
276 def _real_extract(self,url):
277 mobj = re.match(self._VALID_URL, url)
279 video_id = mobj.group('id')
280 mrss_url='http://player.ina.fr/notices/%s.mrss' % video_id
281 video_extension = 'mp4'
# Despite its name, "webpage" holds the content fetched from mrss_url.
282 webpage = self._download_webpage(mrss_url, video_id)
284 self.report_extraction(video_id)
# NOTE(review): the dots in "mp4.ina.fr" are unescaped and therefore match any
# character; "mp4\.ina\.fr" would be stricter.
286 video_url = self._html_search_regex(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
287 webpage, u'video URL')
# The title is read from inside a CDATA section of the <title> element.
289 video_title = self._search_regex(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
# Entries of a dict literal whose opening line is not visible in this excerpt.
295 'ext': video_extension,
296 'title': video_title,
# HowcastIE: the video id is the run of digits after "howcast.com/videos/".
299 class HowcastIE(InfoExtractor):
300 """Information Extractor for Howcast.com"""
301 _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'
# _real_extract(url): downloads the video page and scrapes the media URL,
# title, description and thumbnail from it.
303 def _real_extract(self, url):
304 mobj = re.match(self._VALID_URL, url)
306 video_id = mobj.group('id')
# The page is always fetched from this rebuilt http://www. URL, not from the
# URL that was passed in.
307 webpage_url = 'http://www.howcast.com/videos/' + video_id
308 webpage = self._download_webpage(webpage_url, video_id)
310 self.report_extraction(video_id)
# Only numeric .mp4 files on mobile-media.howcast.com are accepted.
312 video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
313 webpage, u'video URL')
# The title and description patterns have two alternative capture groups
# (double- or single-quoted value). NOTE(review): which group the helper
# returns when only the second one matches is not visible here -- confirm.
315 video_title = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
318 video_description = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
319 webpage, u'description', fatal=False)
321 thumbnail = self._html_search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
322 webpage, u'thumbnail', fatal=False)
# Entries of a dict literal whose opening line is not visible in this excerpt.
328 'title': video_title,
329 'description': video_description,
330 'thumbnail': thumbnail,
# VineIE: the video id is the word-character token after "vine.co/v/".
333 class VineIE(InfoExtractor):
334 """Information Extractor for Vine.co"""
335 _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'
# _real_extract(url): downloads the page and reads the stream URL, title,
# thumbnail and uploader from its markup.
337 def _real_extract(self, url):
338 mobj = re.match(self._VALID_URL, url)
340 video_id = mobj.group('id')
# The page is always fetched from this rebuilt https URL.
341 webpage_url = 'https://vine.co/v/' + video_id
342 webpage = self._download_webpage(webpage_url, video_id)
344 self.report_extraction(video_id)
346 video_url = self._html_search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
347 webpage, u'video URL')
349 video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
# The first group stops before an optional "?query" part, which the second
# group consumes, so group 1 is the image URL without its query string.
352 thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"',
353 webpage, u'thumbnail', fatal=False)
# re.DOTALL lets ".*?" span line breaks between the user <div> and its <h2>.
355 uploader = self._html_search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
356 webpage, u'uploader', fatal=False, flags=re.DOTALL)
# Entries of a dict literal whose opening line is not visible in this excerpt.
362 'title': video_title,
363 'thumbnail': thumbnail,
364 'uploader': uploader,
# FlickrIE: captures the uploader id and the numeric photo/video id from
# flickr.com/photos/<uploader_id>/<id> URLs.
367 class FlickrIE(InfoExtractor):
368 """Information Extractor for Flickr videos"""
369 _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'
# _real_extract(url): page -> photo_secret -> first XML (node_id) -> second XML
# (STREAM element) -> media URL; title, description and thumbnail come from the
# page's og: meta tags.
371 def _real_extract(self, url):
372 mobj = re.match(self._VALID_URL, url)
374 video_id = mobj.group('id')
375 video_uploader_id = mobj.group('uploader_id')
376 webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
377 webpage = self._download_webpage(webpage_url, video_id)
# The secret scraped from the page is needed in both follow-up request URLs.
379 secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')
381 first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
382 first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')
# node_id has the form "<digits>-<digits>" and is used to build second_url.
384 node_id = self._html_search_regex(r'<Item id="id">(\d+-\d+)</Item>',
385 first_xml, u'node_id')
387 second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
388 second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')
390 self.report_extraction(video_id)
392 mobj = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
# The condition guarding this raise is not visible in this excerpt.
394 raise ExtractorError(u'Unable to extract video url')
# Media URL = APP attribute followed by the HTML-unescaped FULLPATH attribute.
395 video_url = mobj.group(1) + unescapeHTML(mobj.group(2))
# Each og: pattern has two alternative groups (double- or single-quoted).
# NOTE(review): which group the helper returns when only the second one
# matches is not visible here -- confirm.
397 video_title = self._html_search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
398 webpage, u'video title')
400 video_description = self._html_search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
401 webpage, u'description', fatal=False)
403 thumbnail = self._html_search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
404 webpage, u'thumbnail', fatal=False)
# Entries of a dict literal whose opening line is not visible in this excerpt.
410 'title': video_title,
411 'description': video_description,
412 'thumbnail': thumbnail,
413 'uploader_id': video_uploader_id,
# TeamcocoIE: matches http://teamcoco.com/video/<url_title>; the numeric video
# id is not in the URL and is scraped from the page.
416 class TeamcocoIE(InfoExtractor):
417 _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'
# _real_extract(url): downloads the page for the id and metadata, then a
# per-video XML document from which the media URL is taken.
419 def _real_extract(self, url):
420 mobj = re.match(self._VALID_URL, url)
# The condition guarding this raise is not visible in this excerpt.
422 raise ExtractorError(u'Invalid URL: %s' % url)
423 url_title = mobj.group('url_title')
# url_title is passed where other extractors pass the video id, because the
# id is not known until the page has been fetched.
424 webpage = self._download_webpage(url, url_title)
426 video_id = self._html_search_regex(r'<article class="video" data-id="(\d+?)"',
427 webpage, u'video id')
429 self.report_extraction(video_id)
431 video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
434 thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)"',
435 webpage, u'thumbnail', fatal=False)
437 video_description = self._html_search_regex(r'<meta property="og:description" content="(.*?)"',
438 webpage, u'description', fatal=False)
440 data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
441 data = self._download_webpage(data_url, video_id, 'Downloading data webpage')
# Only the <file type="high"> entry is looked for.
443 video_url = self._html_search_regex(r'<file type="high".*?>(.*?)</file>',
# Entries of a dict literal whose opening line is not visible in this excerpt.
450 'title': video_title,
451 'thumbnail': thumbnail,
452 'description': video_description,
# XHamsterIE: the video id is the run of digits after "xhamster.com/movies/".
455 class XHamsterIE(InfoExtractor):
456 """Information Extractor for xHamster"""
# NOTE(review): the dot in "(?:www.)?" is unescaped and matches any character;
# "(?:www\.)?" would be stricter.
457 _VALID_URL = r'(?:http://)?(?:www.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'
# _real_extract(url): downloads the movie page and scrapes the media URL,
# title, upload date, uploader id and thumbnail from it.
459 def _real_extract(self,url):
460 mobj = re.match(self._VALID_URL, url)
462 video_id = mobj.group('id')
# Despite its name, mrss_url is a ".html" page URL rebuilt from the id.
463 mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
464 webpage = self._download_webpage(mrss_url, video_id)
466 mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
# The condition guarding this raise is not visible in this excerpt.
468 raise ExtractorError(u'Unable to extract media URL')
# With an empty 'srv' value the 'file' value is URL-unquoted and used as the
# complete URL; the other assignment joins server and file with "/key="
# (the line introducing that alternative branch is not visible here).
469 if len(mobj.group('server')) == 0:
470 video_url = compat_urllib_parse.unquote(mobj.group('file'))
472 video_url = mobj.group('server')+'/key='+mobj.group('file')
# Extension is the text after the last '.' of the media URL.
473 video_extension = video_url.split('.')[-1]
475 video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
478 # Can't see the description anywhere in the UI
479 # video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
480 # webpage, u'description', fatal=False)
481 # if video_description: video_description = unescapeHTML(video_description)
# The upload date is parsed from a hint='YYYY-MM-DD hh:mm:ss TZ' attribute and
# concatenated to YYYYMMDD; the other visible path sets it to None and emits a
# warning (the branching lines are not visible in this excerpt).
483 mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
485 video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
487 video_upload_date = None
488 self._downloader.report_warning(u'Unable to extract upload date')
# default=u'anonymous' is passed to the helper; presumably that value is
# returned when no uploader link is found -- confirm against the helper.
490 video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
491 webpage, u'uploader id', default=u'anonymous')
493 video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
494 webpage, u'thumbnail', fatal=False)
# Entries of a dict literal whose opening line is not visible in this excerpt.
499 'ext': video_extension,
500 'title': video_title,
501 # 'description': video_description,
502 'upload_date': video_upload_date,
503 'uploader_id': video_uploader_id,
504 'thumbnail': video_thumbnail
# HypemIE: matches hypem.com/track/<a>/<b>; the first path segment after
# "track/" is used as the initial track id.
507 class HypemIE(InfoExtractor):
508 """Information Extractor for hypem"""
509 _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'
# _real_extract(url): downloads the track page (keeping its cookie), reads the
# embedded JSON track list, then requests the serve URL for the final media URL.
511 def _real_extract(self, url):
512 mobj = re.match(self._VALID_URL, url)
# The condition guarding this raise is not visible in this excerpt.
514 raise ExtractorError(u'Invalid URL: %s' % url)
515 track_id = mobj.group(1)
# The page is requested with ax=1 and the current timestamp as query params.
# NOTE(review): "?" is appended unconditionally, so a URL that already has a
# query string would end up with two "?".
517 data = { 'ax': 1, 'ts': time.time() }
518 data_encoded = compat_urllib_parse.urlencode(data)
519 complete_url = url + "?" + data_encoded
520 request = compat_urllib_request.Request(complete_url)
521 response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
# The Set-Cookie header of this response is replayed on the serve request below.
522 cookie = urlh.headers.get('Set-Cookie', '')
524 self.report_extraction(track_id)
# The track list is JSON embedded in a <script id="displayList-data"> element.
526 html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
527 response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
# Only the first entry of the "tracks" list is used.
529 track_list = json.loads(html_tracks)
530 track = track_list[u'tracks'][0]
# The handler this raise belongs to is not visible in this excerpt.
532 raise ExtractorError(u'Hypemachine contained invalid JSON.')
# From here on track_id is the id from the JSON, not the URL segment.
535 track_id = track[u"id"]
536 artist = track[u"artist"]
537 title = track[u"song"]
# NOTE(review): "key" is not assigned in the lines visible here -- confirm
# where it is bound (presumably from the track entry).
539 serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
# A data argument is passed (an empty string), which makes urllib send a POST.
540 request = compat_urllib_request.Request(serve_url, "" , {'Content-Type': 'application/json'})
541 request.add_header('cookie', cookie)
542 song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
544 song_data = json.loads(song_data_json)
# The handler this raise belongs to is not visible in this excerpt.
546 raise ExtractorError(u'Hypemachine contained invalid JSON.')
547 final_url = song_data[u"url"]
# Vbox7IE: the video id is everything after "vbox7.com/play:" up to the next "/".
557 class Vbox7IE(InfoExtractor):
558 """Information Extractor for Vbox7"""
559 _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'
# _real_extract(url): follows a JavaScript redirect to reach the real page
# (for the title), then POSTs to an info endpoint for the media and
# thumbnail URLs.
561 def _real_extract(self,url):
562 mobj = re.match(self._VALID_URL, url)
# The condition guarding this raise is not visible in this excerpt.
564 raise ExtractorError(u'Invalid URL: %s' % url)
565 video_id = mobj.group(1)
# The first response sets window.location in script; that value is appended
# to the URL actually reached by the first request.
567 redirect_page, urlh = self._download_webpage_handle(url, video_id)
568 new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
569 redirect_url = urlh.geturl() + new_location
570 webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')
# Title = the part of <title> before the first "/", stripped.
572 title = self._html_search_regex(r'<title>(.*)</title>',
573 webpage, u'title').split('/')[0].strip()
# Form-encoded POST carrying as3=1 and vid=<video id>.
576 info_url = "http://vbox7.com/play/magare.do"
577 data = compat_urllib_parse.urlencode({'as3':'1','vid':video_id})
578 info_request = compat_urllib_request.Request(info_url, data)
579 info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
580 info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
581 if info_response is None:
582 raise ExtractorError(u'Unable to extract the media url')
# The response is split on "&" and the text after the first "=" of each part
# is kept. NOTE(review): the unpacking requires exactly two parts, a part
# without "=" raises IndexError, and a value containing "=" is cut at it.
583 (final_url, thumbnail_url) = map(lambda x: x.split('=')[1], info_response.split('&'))
# Entry of a dict literal whose opening line is not visible in this excerpt.
590 'thumbnail': thumbnail_url,
# NOTE(review): only the head of this function and one list entry are visible
# in this excerpt. Per its docstring the order of the entries decides which
# extractor handles a URL, so new entries must be positioned deliberately.
594 def gen_extractors():
595 """ Return a list of an instance of every supported extractor.
596 The order does matter; the first extractor matched is the one handling the URL.
624 StanfordOpenClassroomIE(),
def get_info_extractor(ie_name):
    """Return the module-level object named ``ie_name`` plus the 'IE' suffix.

    The lookup is done in this module's globals, so a KeyError propagates
    when no object with that name exists.
    """
    module_namespace = globals()
    class_name = ie_name + 'IE'
    return module_namespace[class_name]