10 import xml.etree.ElementTree
19 from .extractor.common import InfoExtractor, SearchInfoExtractor
21 from .extractor.ard import ARDIE
22 from .extractor.arte import ArteTvIE
23 from .extractor.bliptv import BlipTVIE, BlipTVUserIE
24 from .extractor.comedycentral import ComedyCentralIE
25 from .extractor.collegehumor import CollegeHumorIE
26 from .extractor.dailymotion import DailymotionIE
27 from .extractor.depositfiles import DepositFilesIE
28 from .extractor.eighttracks import EightTracksIE
29 from .extractor.escapist import EscapistIE
30 from .extractor.facebook import FacebookIE
31 from .extractor.funnyordie import FunnyOrDieIE
32 from .extractor.gametrailers import GametrailersIE
33 from .extractor.generic import GenericIE
34 from .extractor.googleplus import GooglePlusIE
35 from .extractor.googlesearch import GoogleSearchIE
36 from .extractor.infoq import InfoQIE
37 from .extractor.justintv import JustinTVIE
38 from .extractor.keek import KeekIE
39 from .extractor.metacafe import MetacafeIE
40 from .extractor.mixcloud import MixcloudIE
41 from .extractor.mtv import MTVIE
42 from .extractor.myspass import MySpassIE
43 from .extractor.myvideo import MyVideoIE
44 from .extractor.nba import NBAIE
45 from .extractor.statigram import StatigramIE
46 from .extractor.photobucket import PhotobucketIE
47 from .extractor.pornotube import PornotubeIE
48 from .extractor.rbmaradio import RBMARadioIE
49 from .extractor.soundcloud import SoundcloudIE, SoundcloudSetIE
50 from .extractor.spiegel import SpiegelIE
51 from .extractor.stanfordoc import StanfordOpenClassroomIE
52 from .extractor.steam import SteamIE
53 from .extractor.ted import TEDIE
54 from .extractor.ustream import UstreamIE
55 from .extractor.vimeo import VimeoIE
56 from .extractor.worldstarhiphop import WorldStarHipHopIE
57 from .extractor.xnxx import XNXXIE
58 from .extractor.xvideos import XVideosIE
59 from .extractor.yahoo import YahooIE, YahooSearchIE
60 from .extractor.youjizz import YouJizzIE
61 from .extractor.youku import YoukuIE
62 from .extractor.youporn import YouPornIE
63 from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, YoutubeUserIE, YoutubeChannelIE
64 from .extractor.zdf import ZDFIE
# Extractor for LiveLeak video pages of the form liveleak.com/view?...i=<video_id>.
95 class LiveLeakIE(InfoExtractor):
# The video id is the value of the ``i=`` query parameter (word characters only).
# NOTE(review): ``http?://`` only makes the final "p" optional, so a URL that
# starts with ``https://`` does not match this pattern -- confirm that is intended.
97 _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
# Download the page named by ``url`` and scrape the media URL, title,
# description and uploader out of its HTML.
100 def _real_extract(self, url):
101 mobj = re.match(self._VALID_URL, url)
# NOTE(review): the condition guarding this raise is not part of the reviewed
# text; presumably it only fires when the match above failed -- confirm.
103 raise ExtractorError(u'Invalid URL: %s' % url)
105 video_id = mobj.group('video_id')
107 webpage = self._download_webpage(url, video_id)
# The media URL is taken from a ``file: "...",`` entry in the page source.
109 video_url = self._search_regex(r'file: "(.*?)",',
110 webpage, u'video URL')
# Title comes from the og:title meta tag; the text 'LiveLeak.com -' is removed
# and surrounding whitespace is stripped.
112 video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
113 webpage, u'title').replace('LiveLeak.com -', '').strip()
# Description and uploader are requested with fatal=False, presumably so that a
# missing value does not abort the extraction -- confirm against _html_search_regex.
115 video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
116 webpage, u'description', fatal=False)
# The uploader pattern captures the word characters immediately preceding a
# closing </a> that follows the text 'By:'.
118 video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
119 webpage, u'uploader', fatal=False)
# Fields of the result entry (the surrounding dict literal is not part of the
# reviewed text).
125 'title': video_title,
126 'description': video_description,
127 'uploader': video_uploader
# Extractor for Tumblr post/video pages; the URL supplies the blog name and the
# numeric post id.
134 class TumblrIE(InfoExtractor):
# NOTE(review): the pattern requires a plain http:// scheme and ``\d*`` allows
# an empty id -- confirm both are intended.
135 _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'
# Rebuild the post URL from blog name and id, download it, and pull the video
# file URL, extension, thumbnail and title out of the page.
137 def _real_extract(self, url):
138 m_url = re.match(self._VALID_URL, url)
139 video_id = m_url.group('id')
140 blog = m_url.group('blog_name')
# The /post/ form of the URL is always fetched, even if the input used /video/.
142 url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
143 webpage = self._download_webpage(url, video_id)
# The pattern matches the literal text \x22 (an escaped double quote) around the
# src and type values. ``blog`` and ``video_id`` are interpolated into the
# pattern without re.escape -- NOTE(review): confirm they cannot contain regex
# metacharacters.
145 re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
146 video = re.search(re_video, webpage)
# NOTE(review): the condition guarding this raise is not part of the reviewed
# text; presumably it only fires when the search above found nothing -- confirm.
148 raise ExtractorError(u'Unable to extract video')
149 video_url = video.group('video_url')
150 ext = video.group('ext')
# NOTE(review): this pattern has an unnamed group before the named ``thumb``
# group; confirm which of the two _search_regex returns.
152 video_thumbnail = self._search_regex(r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
153 webpage, u'thumbnail', fatal=False) # We pick the first poster
# Drop every backslash from the thumbnail value when one was found.
154 if video_thumbnail: video_thumbnail = video_thumbnail.replace('\\', '')
156 # The only place where you can get a title, it's not complete,
157 # but searching in other places doesn't work for all videos
158 video_title = self._html_search_regex(r'<title>(?P<title>.*?)</title>',
159 webpage, u'title', flags=re.DOTALL)
# The result is a one-element list holding a dict; only some of its entries are
# part of the reviewed text.
161 return [{'id': video_id,
163 'title': video_title,
164 'thumbnail': video_thumbnail,
# Extractor for Bandcamp track pages. Only tracks that expose a free download
# page are handled; otherwise 'No free songs found' is raised.
168 class BandcampIE(InfoExtractor):
# The ``title`` group is whatever follows /track/ in the URL.
169 _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'
# Follow the chain: track page -> free download page -> statdownload request,
# and read the final media URL from the last response.
171 def _real_extract(self, url):
172 mobj = re.match(self._VALID_URL, url)
173 title = mobj.group('title')
# The URL title is passed as the second argument of the first download because
# the numeric id is not known yet at this point.
174 webpage = self._download_webpage(url, title)
175 # We get the link to the free download page
176 m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
177 if m_download is None:
178 raise ExtractorError(u'No free songs found')
180 download_link = m_download.group(1)
# Numeric id: digits after 'id: ' at the end of a line following
# 'var TralbumData = {'.
# NOTE(review): ``id`` shadows the builtin, and .group() is called directly on
# the search result, so a page without this text raises AttributeError.
181 id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
182 webpage, re.MULTILINE|re.DOTALL).group('id')
184 download_webpage = self._download_webpage(download_link, id,
185 'Downloading free downloads page')
186 # We get the dictionary of the track from some javascrip code
# Captures the text after 'items: ' up to a comma that ends the line, parses it
# as JSON and keeps the first element. The search result is not checked for None.
187 info = re.search(r'items: (.*?),$',
188 download_webpage, re.MULTILINE).group(1)
189 info = json.loads(info)[0]
190 # We pick mp3-320 for now, until format selection can be easily implemented.
191 mp3_info = info[u'downloads'][u'mp3-320']
192 # If we try to use this url it says the link has expired
193 initial_url = mp3_info[u'url']
# Split the initial URL into server, fsig and ts. The pattern also captures an
# ``id`` group, but the request below uses the id scraped from the track page.
194 re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
195 m_url = re.match(re_url, initial_url)
196 #We build the url we will use to get the final track url
197 # This url is build in Bandcamp in the script download_bunde_*.js
# The .rand and .vrs values are fixed literals.
198 request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), id, m_url.group('ts'))
199 final_url_webpage = self._download_webpage(request_url, id, 'Requesting download url')
200 # If we could correctly generate the .rand field the url would be
201 #in the "download_url" key
202 final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)
# Result entry built from the parsed track dictionary; only some of its fields
# are part of the reviewed text.
204 track_info = {'id':id,
205 'title' : info[u'title'],
208 'thumbnail' : info[u'thumb_url'],
209 'uploader' : info[u'artist']
# Scrapes a redtube.com video page for its MP4 source URL and title.
214 class RedTubeIE(InfoExtractor):
215 """Information Extractor for redtube"""
# Scheme and 'www.' are optional; the id is the leading run of digits in the path.
216 _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'
218 def _real_extract(self,url):
219 mobj = re.match(self._VALID_URL, url)
# NOTE(review): the condition guarding this raise is not part of the reviewed
# text; presumably it only fires when the match above failed -- confirm.
221 raise ExtractorError(u'Invalid URL: %s' % url)
223 video_id = mobj.group('id')
# The extension is fixed to 'mp4', matching the video/mp4 source searched below.
224 video_extension = 'mp4'
225 webpage = self._download_webpage(url, video_id)
227 self.report_extraction(video_id)
# Media URL: src attribute of the <source ... type="video/mp4"> tag.
229 video_url = self._html_search_regex(r'<source src="(.+?)" type="video/mp4">',
230 webpage, u'video URL')
# Title: contents of the <h1 class="videoTitle slidePanelMovable"> element.
# NOTE(review): the remaining arguments of this call and the opening of the
# result dict are not part of the reviewed text.
232 video_title = self._html_search_regex('<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
238 'ext': video_extension,
239 'title': video_title,
# Resolves ina.fr videos through the player's MRSS notice rather than the HTML page.
242 class InaIE(InfoExtractor):
243 """Information Extractor for Ina.fr"""
# Ids are the letter 'I' followed by digits; at least a '/' must follow the id.
244 _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'
246 def _real_extract(self,url):
247 mobj = re.match(self._VALID_URL, url)
249 video_id = mobj.group('id')
# The notice at player.ina.fr/notices/<id>.mrss is downloaded instead of ``url``.
250 mrss_url='http://player.ina.fr/notices/%s.mrss' % video_id
251 video_extension = 'mp4'
252 webpage = self._download_webpage(mrss_url, video_id)
254 self.report_extraction(video_id)
# Media URL: the url attribute of <media:player>, restricted to .mp4 files.
# NOTE(review): the dots in ``mp4.ina.fr`` are unescaped, so they match any
# character -- confirm whether they should be literal.
256 video_url = self._html_search_regex(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
257 webpage, u'video URL')
# Title: the CDATA payload of the <title> element.
# NOTE(review): the remaining arguments of this call and the opening of the
# result dict are not part of the reviewed text.
259 video_title = self._search_regex(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
265 'ext': video_extension,
266 'title': video_title,
# Scrapes a howcast.com video page for the mobile MP4 URL and its meta-tag metadata.
269 class HowcastIE(InfoExtractor):
270 """Information Extractor for Howcast.com"""
271 _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'
273 def _real_extract(self, url):
274 mobj = re.match(self._VALID_URL, url)
276 video_id = mobj.group('id')
# The page is always fetched from the http://www.howcast.com/videos/<id> form,
# whatever form the input URL had.
277 webpage_url = 'http://www.howcast.com/videos/' + video_id
278 webpage = self._download_webpage(webpage_url, video_id)
280 self.report_extraction(video_id)
# Media URL: a ``file: "http://mobile-media.howcast.com/<digits>.mp4`` entry;
# the key may or may not be wrapped in single quotes.
282 video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
283 webpage, u'video URL')
# The meta patterns below expect the content attribute BEFORE property/name and
# accept either double- or single-quoted content (one group per quoting style).
# NOTE(review): the remaining arguments of the title call are not part of the
# reviewed text.
285 video_title = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
288 video_description = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
289 webpage, u'description', fatal=False)
# The thumbnail pattern only accepts single-quoted content.
291 thumbnail = self._html_search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
292 webpage, u'thumbnail', fatal=False)
# Fields of the result entry (the surrounding dict literal is not part of the
# reviewed text).
298 'title': video_title,
299 'description': video_description,
300 'thumbnail': thumbnail,
# Scrapes a vine.co video page; all values except the uploader come from meta tags.
303 class VineIE(InfoExtractor):
304 """Information Extractor for Vine.co"""
305 _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'
307 def _real_extract(self, url):
308 mobj = re.match(self._VALID_URL, url)
310 video_id = mobj.group('id')
# The page is always fetched from the https://vine.co/v/<id> form.
311 webpage_url = 'https://vine.co/v/' + video_id
312 webpage = self._download_webpage(webpage_url, video_id)
314 self.report_extraction(video_id)
# Media URL: content of the twitter:player:stream meta tag.
316 video_url = self._html_search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
317 webpage, u'video URL')
# NOTE(review): the remaining arguments of the title call are not part of the
# reviewed text.
319 video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
# The og:image pattern separates an optional query string into a second group;
# NOTE(review): confirm the helper returns the first group (URL without query).
322 thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"',
323 webpage, u'thumbnail', fatal=False)
# Uploader: first <h2> after <div class="user">; re.DOTALL lets the gap span lines.
325 uploader = self._html_search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
326 webpage, u'uploader', fatal=False, flags=re.DOTALL)
# Fields of the result entry (the surrounding dict literal is not part of the
# reviewed text).
332 'title': video_title,
333 'thumbnail': thumbnail,
334 'uploader': uploader,
# Resolves Flickr videos in three requests: photo page -> video_mtl_xml.gne ->
# video_playlist.gne, then joins the stream's APP and FULLPATH attributes.
337 class FlickrIE(InfoExtractor):
338 """Information Extractor for Flickr videos"""
# Captures the uploader id (path segment after /photos/) and the numeric photo id.
339 _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'
341 def _real_extract(self, url):
342 mobj = re.match(self._VALID_URL, url)
344 video_id = mobj.group('id')
345 video_uploader_id = mobj.group('uploader_id')
# The photo page is refetched from a URL rebuilt out of the two captured parts.
346 webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
347 webpage = self._download_webpage(webpage_url, video_id)
# ``photo_secret`` from the page is required by both follow-up requests.
349 secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')
351 first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
352 first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')
# The first response supplies the node id (<digits>-<digits>) for the playlist request.
354 node_id = self._html_search_regex(r'<Item id="id">(\d+-\d+)</Item>',
355 first_xml, u'node_id')
357 second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
358 second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')
360 self.report_extraction(video_id)
# ``mobj`` is reused here for the <STREAM> match in the playlist response.
362 mobj = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
# NOTE(review): the condition guarding this raise is not part of the reviewed
# text; presumably it only fires when the search above found nothing -- confirm.
364 raise ExtractorError(u'Unable to extract video url')
# Final URL = APP value followed by the HTML-unescaped FULLPATH value.
365 video_url = mobj.group(1) + unescapeHTML(mobj.group(2))
# The og: meta patterns accept double- or single-quoted content (one group each).
367 video_title = self._html_search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
368 webpage, u'video title')
370 video_description = self._html_search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
371 webpage, u'description', fatal=False)
373 thumbnail = self._html_search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
374 webpage, u'thumbnail', fatal=False)
# Fields of the result entry (the surrounding dict literal is not part of the
# reviewed text).
380 'title': video_title,
381 'description': video_description,
382 'thumbnail': thumbnail,
383 'uploader_id': video_uploader_id,
# Extractor for teamcoco.com video pages: metadata comes from the page's og:
# tags, the media URL from a per-video XML document.
386 class TeamcocoIE(InfoExtractor):
# Requires an explicit http:// scheme and no 'www.'; ``url_title`` is everything
# after /video/.
387 _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'
389 def _real_extract(self, url):
390 mobj = re.match(self._VALID_URL, url)
# NOTE(review): the condition guarding this raise is not part of the reviewed
# text; presumably it only fires when the match above failed -- confirm.
392 raise ExtractorError(u'Invalid URL: %s' % url)
393 url_title = mobj.group('url_title')
# The URL title stands in as the id for this download; the numeric id is only
# known once the page has been fetched.
394 webpage = self._download_webpage(url, url_title)
# Numeric video id: data-id attribute of <article class="video">.
396 video_id = self._html_search_regex(r'<article class="video" data-id="(\d+?)"',
397 webpage, u'video id')
399 self.report_extraction(video_id)
# NOTE(review): the remaining arguments of the title call are not part of the
# reviewed text.
401 video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
404 thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)"',
405 webpage, u'thumbnail', fatal=False)
# Unlike title and thumbnail, the description pattern (.*?) accepts empty content.
407 video_description = self._html_search_regex(r'<meta property="og:description" content="(.*?)"',
408 webpage, u'description', fatal=False)
# The media URL is read from teamcoco.com/cvp/2.0/<id>.xml, from the <file>
# element whose type is "high".
410 data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
411 data = self._download_webpage(data_url, video_id, 'Downloading data webpage')
# NOTE(review): the remaining arguments of this call and the opening of the
# result dict are not part of the reviewed text.
413 video_url = self._html_search_regex(r'<file type="high".*?>(.*?)</file>',
420 'title': video_title,
421 'thumbnail': thumbnail,
422 'description': video_description,
# Scrapes an xhamster.com movie page for the media URL, title, upload date,
# uploader id and thumbnail.
425 class XHamsterIE(InfoExtractor):
426 """Information Extractor for xHamster"""
# NOTE(review): the dot in ``(?:www.)?`` is unescaped, so it matches any
# character after 'www' -- confirm whether it should be literal.
427 _VALID_URL = r'(?:http://)?(?:www.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'
429 def _real_extract(self,url):
430 mobj = re.match(self._VALID_URL, url)
432 video_id = mobj.group('id')
# Despite its name, ``mrss_url`` is the movie's .html page URL rebuilt from the
# id alone.
433 mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
434 webpage = self._download_webpage(mrss_url, video_id)
# The page carries a 'srv' / 'file' pair in single-quoted key/value form.
436 mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
# NOTE(review): the condition guarding this raise is not part of the reviewed
# text; presumably it only fires when the search above found nothing -- confirm.
438 raise ExtractorError(u'Unable to extract media URL')
# With an empty server, 'file' is URL-unquoted and used as the media URL;
# the other assignment joins server and file with '/key='.
# NOTE(review): the else line between the two assignments is not part of the
# reviewed text.
439 if len(mobj.group('server')) == 0:
440 video_url = compat_urllib_parse.unquote(mobj.group('file'))
442 video_url = mobj.group('server')+'/key='+mobj.group('file')
# Extension = text after the last '.' in the media URL.
443 video_extension = video_url.split('.')[-1]
# NOTE(review): the remaining arguments of the title call are not part of the
# reviewed text.
445 video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
448 # Can't see the description anywhere in the UI
449 # video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
450 # webpage, u'description', fatal=False)
451 # if video_description: video_description = unescapeHTML(video_description)
# Upload date: a hint='YYYY-MM-DD hh:mm:ss TZ' attribute, reduced to YYYYMMDD.
# NOTE(review): the if/else lines choosing between the date and the
# None-plus-warning fallback are not part of the reviewed text.
453 mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
455 video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
457 video_upload_date = None
458 self._downloader.report_warning(u'Unable to extract upload date')
# Uploader id: link text of the first /user/ anchor; u'anonymous' is passed as
# the default.
460 video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
461 webpage, u'uploader id', default=u'anonymous')
463 video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
464 webpage, u'thumbnail', fatal=False)
# Fields of the result entry (the surrounding dict literal is not part of the
# reviewed text).
469 'ext': video_extension,
470 'title': video_title,
471 # 'description': video_description,
472 'upload_date': video_upload_date,
473 'uploader_id': video_uploader_id,
474 'thumbnail': video_thumbnail
# Extractor for hypem.com track pages: reads the track list embedded in the page
# as JSON, then asks the /serve/source/ endpoint for the media URL.
477 class HypemIE(InfoExtractor):
478 """Information Extractor for hypem"""
# Two unnamed path segments follow /track/; the first one is used as the track id.
479 _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'
481 def _real_extract(self, url):
482 mobj = re.match(self._VALID_URL, url)
# NOTE(review): the condition guarding this raise is not part of the reviewed
# text; presumably it only fires when the match above failed -- confirm.
484 raise ExtractorError(u'Invalid URL: %s' % url)
485 track_id = mobj.group(1)
# The page is requested with ax=1 and the current timestamp appended as a query
# string. NOTE(review): '?' is appended unconditionally, which yields a
# malformed URL if ``url`` already carries a query string.
487 data = { 'ax': 1, 'ts': time.time() }
488 data_encoded = compat_urllib_parse.urlencode(data)
489 complete_url = url + "?" + data_encoded
490 request = compat_urllib_request.Request(complete_url)
491 response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
# The Set-Cookie header of this response is replayed on the metadata request below.
492 cookie = urlh.headers.get('Set-Cookie', '')
494 self.report_extraction(track_id)
# The track list is the JSON payload of the <script id="displayList-data"> element.
496 html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
497 response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
# Only the first entry of the 'tracks' list is used.
# NOTE(review): the try/except lines around the JSON parsing are not part of the
# reviewed text; presumably the raise below handles a parse failure -- confirm.
499 track_list = json.loads(html_tracks)
500 track = track_list[u'tracks'][0]
502 raise ExtractorError(u'Hypemachine contained invalid JSON.')
# From here on ``track_id`` is the id reported by the JSON, not the URL segment.
505 track_id = track[u"id"]
506 artist = track[u"artist"]
507 title = track[u"song"]
# NOTE(review): ``key`` is used below but its assignment is not part of the
# reviewed text.
509 serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
# An empty string is passed as request data together with a JSON Content-Type;
# presumably this turns the request into a POST -- confirm against
# compat_urllib_request.
510 request = compat_urllib_request.Request(serve_url, "" , {'Content-Type': 'application/json'})
511 request.add_header('cookie', cookie)
512 song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
# NOTE(review): as above, the try/except lines around this parse are not part of
# the reviewed text.
514 song_data = json.loads(song_data_json)
516 raise ExtractorError(u'Hypemachine contained invalid JSON.')
# The media URL is the 'url' field of the metadata response.
517 final_url = song_data[u"url"]
# Extractor for vbox7.com: follows the page's JavaScript redirect to get the
# title, then POSTs to /play/magare.do for the media and thumbnail URLs.
527 class Vbox7IE(InfoExtractor):
528 """Information Extractor for Vbox7"""
# The id is everything after 'play:' up to the next '/'.
529 _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'
531 def _real_extract(self,url):
532 mobj = re.match(self._VALID_URL, url)
# NOTE(review): the condition guarding this raise is not part of the reviewed
# text; presumably it only fires when the match above failed -- confirm.
534 raise ExtractorError(u'Invalid URL: %s' % url)
535 video_id = mobj.group(1)
# The first response sets ``window.location`` from script; that value is
# appended to the URL the request finally resolved to (urlh.geturl()).
537 redirect_page, urlh = self._download_webpage_handle(url, video_id)
538 new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
539 redirect_url = urlh.geturl() + new_location
540 webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')
# Title: the part of <title> before the first '/', stripped of whitespace.
542 title = self._html_search_regex(r'<title>(.*)</title>',
543 webpage, u'title').split('/')[0].strip()
# Form-encoded request carrying as3=1 and the video id.
546 info_url = "http://vbox7.com/play/magare.do"
547 data = compat_urllib_parse.urlencode({'as3':'1','vid':video_id})
548 info_request = compat_urllib_request.Request(info_url, data)
549 info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
550 info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
551 if info_response is None:
552 raise ExtractorError(u'Unable to extract the media url')
# The response is split on '&' and the text after the first '=' of each part is
# kept, in order, as (media URL, thumbnail URL). The tuple assignment requires
# exactly two parts, each containing '='.
# NOTE(review): split('=')[1] drops anything after a second '=' in a value.
553 (final_url, thumbnail_url) = map(lambda x: x.split('=')[1], info_response.split('&'))
# Field of the result entry (the surrounding dict literal and the return are not
# part of the reviewed text).
560 'thumbnail': thumbnail_url,
# Factory for the list of extractor instances; per the docstring below, order
# matters because the first extractor that matches a URL is the one handling it.
# NOTE(review): the docstring terminator and all list entries but one are not
# part of the reviewed text.
564 def gen_extractors():
565 """ Return a list of an instance of every supported extractor.
566 The order does matter; the first extractor matched is the one handling the URL.
594 StanfordOpenClassroomIE(),
def get_info_extractor(ie_name):
    """Look up an extractor class in this module by its short name.

    ``ie_name`` is the class name without the trailing ``IE``; the suffix is
    appended and the result is resolved in this module's global namespace.
    An unknown name raises KeyError.
    """
    module_namespace = globals()
    class_name = ie_name + 'IE'
    return module_namespace[class_name]