10 import xml.etree.ElementTree
19 from .extractor.common import InfoExtractor, SearchInfoExtractor
21 from .extractor.ard import ARDIE
22 from .extractor.arte import ArteTvIE
23 from .extractor.bliptv import BlipTVIE, BlipTVUserIE
24 from .extractor.comedycentral import ComedyCentralIE
25 from .extractor.collegehumor import CollegeHumorIE
26 from .extractor.dailymotion import DailymotionIE
27 from .extractor.depositfiles import DepositFilesIE
28 from .extractor.eighttracks import EightTracksIE
29 from .extractor.escapist import EscapistIE
30 from .extractor.facebook import FacebookIE
31 from .extractor.funnyordie import FunnyOrDieIE
32 from .extractor.gametrailers import GametrailersIE
33 from .extractor.generic import GenericIE
34 from .extractor.googleplus import GooglePlusIE
35 from .extractor.googlesearch import GoogleSearchIE
36 from .extractor.infoq import InfoQIE
37 from .extractor.justintv import JustinTVIE
38 from .extractor.keek import KeekIE
39 from .extractor.liveleak import LiveLeakIE
40 from .extractor.metacafe import MetacafeIE
41 from .extractor.mixcloud import MixcloudIE
42 from .extractor.mtv import MTVIE
43 from .extractor.myspass import MySpassIE
44 from .extractor.myvideo import MyVideoIE
45 from .extractor.nba import NBAIE
46 from .extractor.statigram import StatigramIE
47 from .extractor.photobucket import PhotobucketIE
48 from .extractor.pornotube import PornotubeIE
49 from .extractor.rbmaradio import RBMARadioIE
50 from .extractor.soundcloud import SoundcloudIE, SoundcloudSetIE
51 from .extractor.spiegel import SpiegelIE
52 from .extractor.stanfordoc import StanfordOpenClassroomIE
53 from .extractor.steam import SteamIE
54 from .extractor.ted import TEDIE
55 from .extractor.tumblr import TumblrIE
56 from .extractor.ustream import UstreamIE
57 from .extractor.vimeo import VimeoIE
58 from .extractor.worldstarhiphop import WorldStarHipHopIE
59 from .extractor.xnxx import XNXXIE
60 from .extractor.xvideos import XVideosIE
61 from .extractor.yahoo import YahooIE, YahooSearchIE
62 from .extractor.youjizz import YouJizzIE
63 from .extractor.youku import YoukuIE
64 from .extractor.youporn import YouPornIE
65 from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, YoutubeUserIE, YoutubeChannelIE
66 from .extractor.zdf import ZDFIE
# Extractor for Bandcamp track pages: follows the track's free-download page
# and derives a direct MP3 (mp3-320) URL from it.
# NOTE(review): this listing skips some numbered lines; only the statements
# that are visible are annotated.
101 class BandcampIE(InfoExtractor):
# Everything after "/track/" is captured as the 'title' group.
102 _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'
104 def _real_extract(self, url):
# NOTE(review): mobj is dereferenced on the next line without a None check.
105 mobj = re.match(self._VALID_URL, url)
106 title = mobj.group('title')
107 webpage = self._download_webpage(url, title)
108 # We get the link to the free download page
109 m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
110 if m_download is None:
111 raise ExtractorError(u'No free songs found')
113 download_link = m_download.group(1)
# The numeric track id is read from the page's "var TralbumData = {...}" script.
# NOTE(review): 'id' shadows the builtin, and .group() raises AttributeError
# if the pattern is not found (no None check here).
114 id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
115 webpage, re.MULTILINE|re.DOTALL).group('id')
117 download_webpage = self._download_webpage(download_link, id,
118 'Downloading free downloads page')
119 # We get the dictionary of the track from some javascript code
120 info = re.search(r'items: (.*?),$',
121 download_webpage, re.MULTILINE).group(1)
# The captured "items" value is parsed as JSON and its first element is used.
122 info = json.loads(info)[0]
123 # We pick mp3-320 for now, until format selection can be easily implemented.
124 mp3_info = info[u'downloads'][u'mp3-320']
125 # If we try to use this url it says the link has expired
126 initial_url = mp3_info[u'url']
# Split the expiring URL into server, fsig and ts so it can be rebuilt below.
127 re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
128 m_url = re.match(re_url, initial_url)
129 #We build the url we will use to get the final track url
130 # This url is built in Bandcamp in the script download_bunde_*.js
# The id substituted here is the TralbumData id from above, not the 'id'
# group of re_url; the .rand value is a fixed constant.
131 request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), id, m_url.group('ts'))
132 final_url_webpage = self._download_webpage(request_url, id, 'Requesting download url')
133 # If we could correctly generate the .rand field the url would be
134 #in the "download_url" key
135 final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)
# Metadata comes from the parsed "items" entry; the remaining keys and the
# closing of this dict are on lines not shown here.
137 track_info = {'id':id,
138 'title' : info[u'title'],
141 'thumbnail' : info[u'thumb_url'],
142 'uploader' : info[u'artist']
# Extractor for redtube.com video pages: reads the MP4 source URL and the
# title out of the page HTML.
# NOTE(review): this listing skips some numbered lines; only the statements
# that are visible are annotated.
147 class RedTubeIE(InfoExtractor):
148 """Information Extractor for redtube"""
# Scheme and "www." are optional; the numeric path component is the video id.
149 _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'
151 def _real_extract(self,url):
152 mobj = re.match(self._VALID_URL, url)
# NOTE(review): the condition guarding this raise is on a line not shown here.
154 raise ExtractorError(u'Invalid URL: %s' % url)
156 video_id = mobj.group('id')
# The extension is fixed to mp4, matching the type="video/mp4" source
# element searched for below.
157 video_extension = 'mp4'
158 webpage = self._download_webpage(url, video_id)
160 self.report_extraction(video_id)
162 video_url = self._html_search_regex(r'<source src="(.+?)" type="video/mp4">',
163 webpage, u'video URL')
# The title is the text of the <h1 class="videoTitle slidePanelMovable">
# element; the remaining arguments of this call are on a line not shown.
165 video_title = self._html_search_regex('<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
# Visible entries of the result dict; its opening and other keys are elided.
171 'ext': video_extension,
172 'title': video_title,
# Extractor for ina.fr video pages: the media URL and title are read from the
# player's MRSS document rather than from the page given by the user.
# NOTE(review): this listing skips some numbered lines; only the statements
# that are visible are annotated.
175 class InaIE(InfoExtractor):
176 """Information Extractor for Ina.fr"""
# The id is an "I" followed by digits, taken from the /video/<id>/ path.
177 _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'
179 def _real_extract(self,url):
180 mobj = re.match(self._VALID_URL, url)
182 video_id = mobj.group('id')
# The MRSS notice for the id is what actually gets downloaded below.
183 mrss_url='http://player.ina.fr/notices/%s.mrss' % video_id
184 video_extension = 'mp4'
185 webpage = self._download_webpage(mrss_url, video_id)
187 self.report_extraction(video_id)
# NOTE(review): the dots in "http://mp4.ina.fr/" are unescaped in this
# pattern, so each matches any character, not only a literal ".".
189 video_url = self._html_search_regex(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
190 webpage, u'video URL')
# The title is the CDATA payload of the <title> element; the remaining
# arguments of this call are on a line not shown.
192 video_title = self._search_regex(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
# Visible entries of the result dict; its opening and other keys are elided.
198 'ext': video_extension,
199 'title': video_title,
# Extractor for howcast.com videos: reads the mobile MP4 URL plus title,
# description and thumbnail from the video page.
# NOTE(review): this listing skips some numbered lines; only the statements
# that are visible are annotated.
202 class HowcastIE(InfoExtractor):
203 """Information Extractor for Howcast.com"""
# http/https and "www." are optional; the numeric id follows /videos/.
204 _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'
206 def _real_extract(self, url):
207 mobj = re.match(self._VALID_URL, url)
209 video_id = mobj.group('id')
# The page is fetched from a URL rebuilt from the id, not from the URL
# that was passed in.
210 webpage_url = 'http://www.howcast.com/videos/' + video_id
211 webpage = self._download_webpage(webpage_url, video_id)
213 self.report_extraction(video_id)
# Matches a (optionally quoted) file: key whose value is a
# mobile-media.howcast.com .mp4 URL.
215 video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
216 webpage, u'video URL')
# The content attribute may be double- or single-quoted, hence two
# alternative groups; the rest of this call is on a line not shown.
218 video_title = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
# Description and thumbnail are requested with fatal=False.
# NOTE(review): presumably a missing match is then non-fatal; that behaviour
# is defined in InfoExtractor, outside this listing - confirm.
221 video_description = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
222 webpage, u'description', fatal=False)
224 thumbnail = self._html_search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
225 webpage, u'thumbnail', fatal=False)
# Visible entries of the result dict; its opening and other keys are elided.
231 'title': video_title,
232 'description': video_description,
233 'thumbnail': thumbnail,
# Extractor for vine.co videos: reads the stream URL, title, thumbnail and
# uploader from meta tags and markup on the video page.
# NOTE(review): this listing skips some numbered lines; only the statements
# that are visible are annotated.
236 class VineIE(InfoExtractor):
237 """Information Extractor for Vine.co"""
# http/https and "www." are optional; the id is the word after /v/.
238 _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'
240 def _real_extract(self, url):
241 mobj = re.match(self._VALID_URL, url)
243 video_id = mobj.group('id')
# The page is fetched from an https URL rebuilt from the id, not from the
# URL that was passed in.
244 webpage_url = 'https://vine.co/v/' + video_id
245 webpage = self._download_webpage(webpage_url, video_id)
247 self.report_extraction(video_id)
# The media URL is the content of the twitter:player:stream meta tag.
249 video_url = self._html_search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
250 webpage, u'video URL')
# The remaining arguments of this call are on a line not shown.
252 video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
# The first group stops before any "?query" suffix, which is matched by the
# optional second group.
255 thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"',
256 webpage, u'thumbnail', fatal=False)
# re.DOTALL lets ".*?" span line breaks between the user <div> and its <h2>.
258 uploader = self._html_search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
259 webpage, u'uploader', fatal=False, flags=re.DOTALL)
# Visible entries of the result dict; its opening and other keys are elided.
265 'title': video_title,
266 'thumbnail': thumbnail,
267 'uploader': uploader,
# Extractor for Flickr videos: resolves the stream URL through two XML
# lookups keyed by the photo secret, and reads metadata from og: meta tags.
# NOTE(review): this listing skips some numbered lines; only the statements
# that are visible are annotated.
270 class FlickrIE(InfoExtractor):
271 """Information Extractor for Flickr videos"""
# Captures the uploader id (path segment after /photos/) and the numeric id.
272 _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'
274 def _real_extract(self, url):
275 mobj = re.match(self._VALID_URL, url)
277 video_id = mobj.group('id')
278 video_uploader_id = mobj.group('uploader_id')
# The page is fetched from a URL rebuilt from the two captured parts.
279 webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
280 webpage = self._download_webpage(webpage_url, video_id)
# The photo_secret value from the page is needed by both XML requests below.
282 secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')
# Step 1: the first XML document supplies the node id.
284 first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
285 first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')
287 node_id = self._html_search_regex(r'<Item id="id">(\d+-\d+)</Item>',
288 first_xml, u'node_id')
# Step 2: the playlist XML for that node contains the STREAM element.
290 second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
291 second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')
293 self.report_extraction(video_id)
295 mobj = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
# NOTE(review): the condition guarding this raise is on a line not shown here.
297 raise ExtractorError(u'Unable to extract video url')
# The final URL is the APP attribute followed by the HTML-unescaped FULLPATH.
298 video_url = mobj.group(1) + unescapeHTML(mobj.group(2))
# Each og: pattern accepts a double- or single-quoted content attribute.
300 video_title = self._html_search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
301 webpage, u'video title')
303 video_description = self._html_search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
304 webpage, u'description', fatal=False)
306 thumbnail = self._html_search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
307 webpage, u'thumbnail', fatal=False)
# Visible entries of the result dict; its opening and other keys are elided.
313 'title': video_title,
314 'description': video_description,
315 'thumbnail': thumbnail,
316 'uploader_id': video_uploader_id,
# Extractor for teamcoco.com videos: reads the numeric video id and metadata
# from the page, then fetches a per-video XML document for the media URL.
# NOTE(review): this listing skips some numbered lines; only the statements
# that are visible are annotated.
319 class TeamcocoIE(InfoExtractor):
# Everything after /video/ is captured as 'url_title'.
320 _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'
322 def _real_extract(self, url):
323 mobj = re.match(self._VALID_URL, url)
# NOTE(review): the condition guarding this raise is on a line not shown here.
325 raise ExtractorError(u'Invalid URL: %s' % url)
# url_title is passed as the second argument of the first download only;
# the numeric id found in the page is used from then on.
326 url_title = mobj.group('url_title')
327 webpage = self._download_webpage(url, url_title)
329 video_id = self._html_search_regex(r'<article class="video" data-id="(\d+?)"',
330 webpage, u'video id')
332 self.report_extraction(video_id)
# The remaining arguments of this call are on a line not shown.
334 video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
337 thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)"',
338 webpage, u'thumbnail', fatal=False)
# "(.*?)" (not "(.+?)") lets an empty description attribute match as well.
340 video_description = self._html_search_regex(r'<meta property="og:description" content="(.*?)"',
341 webpage, u'description', fatal=False)
# The media URL is taken from the <file type="high"> element of this XML.
343 data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
344 data = self._download_webpage(data_url, video_id, 'Downloading data webpage')
346 video_url = self._html_search_regex(r'<file type="high".*?>(.*?)</file>',
# Visible entries of the result dict; its opening and other keys are elided.
353 'title': video_title,
354 'thumbnail': thumbnail,
355 'description': video_description,
# Extractor for xhamster.com movie pages: builds the media URL from the
# 'srv'/'file' values embedded in the page and collects basic metadata.
# NOTE(review): this listing skips some numbered lines; only the statements
# that are visible are annotated.
358 class XHamsterIE(InfoExtractor):
359 """Information Extractor for xHamster"""
# NOTE(review): the "." in "(?:www.)?" is unescaped, so it matches any
# character rather than only a literal dot.
360 _VALID_URL = r'(?:http://)?(?:www.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'
362 def _real_extract(self,url):
363 mobj = re.match(self._VALID_URL, url)
365 video_id = mobj.group('id')
# Despite the name, this is the movie page URL rebuilt from the id.
366 mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
367 webpage = self._download_webpage(mrss_url, video_id)
369 mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
# NOTE(review): the condition guarding this raise is on a line not shown here.
371 raise ExtractorError(u'Unable to extract media URL')
# With an empty 'srv', 'file' is used as the URL after URL-unquoting;
# the other assignment joins server and file with "/key=" (the branch
# keyword between the two assignments is on a line not shown).
372 if len(mobj.group('server')) == 0:
373 video_url = compat_urllib_parse.unquote(mobj.group('file'))
375 video_url = mobj.group('server')+'/key='+mobj.group('file')
# The extension is whatever follows the last "." in the media URL.
376 video_extension = video_url.split('.')[-1]
# The remaining arguments of this call are on a line not shown.
378 video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
381 # Can't see the description anywhere in the UI
382 # video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
383 # webpage, u'description', fatal=False)
384 # if video_description: video_description = unescapeHTML(video_description)
# The upload date is read from a hint='YYYY-MM-DD hh:mm:ss TZ' attribute and
# joined as YYYYMMDD; the visible fallback sets None and reports a warning
# (the branching lines around these statements are not shown).
386 mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
388 video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
390 video_upload_date = None
391 self._downloader.report_warning(u'Unable to extract upload date')
# u'anonymous' is supplied as the default for the uploader id lookup.
393 video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
394 webpage, u'uploader id', default=u'anonymous')
396 video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
397 webpage, u'thumbnail', fatal=False)
# Visible entries of the result dict; its opening and other keys are elided.
402 'ext': video_extension,
403 'title': video_title,
404 # 'description': video_description,
405 'upload_date': video_upload_date,
406 'uploader_id': video_uploader_id,
407 'thumbnail': video_thumbnail
# Extractor for hypem.com track pages: reads the track list JSON embedded in
# the page, then asks the /serve/source endpoint for the final media URL.
# NOTE(review): this listing skips some numbered lines; only the statements
# that are visible are annotated.
410 class HypemIE(InfoExtractor):
411 """Information Extractor for hypem"""
# Two path segments follow /track/; only the first is read below.
412 _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'
414 def _real_extract(self, url):
415 mobj = re.match(self._VALID_URL, url)
# NOTE(review): the condition guarding this raise is on a line not shown here.
417 raise ExtractorError(u'Invalid URL: %s' % url)
418 track_id = mobj.group(1)
# The page is requested with ax=1 and the current timestamp appended as a
# query string.
420 data = { 'ax': 1, 'ts': time.time() }
421 data_encoded = compat_urllib_parse.urlencode(data)
422 complete_url = url + "?" + data_encoded
423 request = compat_urllib_request.Request(complete_url)
424 response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
# The Set-Cookie header of this response is replayed on the metadata
# request further down.
425 cookie = urlh.headers.get('Set-Cookie', '')
427 self.report_extraction(track_id)
# The track list is the JSON body of the <script id="displayList-data"> tag.
429 html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
430 response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
# Only the first entry of the "tracks" array is used.
432 track_list = json.loads(html_tracks)
433 track = track_list[u'tracks'][0]
# NOTE(review): the handler that leads to this raise is on lines not shown.
435 raise ExtractorError(u'Hypemachine contained invalid JSON.')
# From here on track_id is the id reported by the JSON, replacing the one
# taken from the URL.
438 track_id = track[u"id"]
439 artist = track[u"artist"]
440 title = track[u"song"]
# NOTE(review): 'key' is used here but its assignment is not among the
# visible lines - confirm where it is set.
442 serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
# An empty string is passed as the request data together with a JSON
# Content-Type header.
443 request = compat_urllib_request.Request(serve_url, "" , {'Content-Type': 'application/json'})
444 request.add_header('cookie', cookie)
445 song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
447 song_data = json.loads(song_data_json)
# NOTE(review): the handler that leads to this raise is on lines not shown.
449 raise ExtractorError(u'Hypemachine contained invalid JSON.')
450 final_url = song_data[u"url"]
# Extractor for vbox7.com videos: follows the JavaScript redirect to reach the
# real page for the title, then posts to magare.do for the media URL.
# NOTE(review): this listing skips some numbered lines; only the statements
# that are visible are annotated.
460 class Vbox7IE(InfoExtractor):
461 """Information Extractor for Vbox7"""
# The video id is whatever follows "play:" up to the next "/".
462 _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'
464 def _real_extract(self,url):
465 mobj = re.match(self._VALID_URL, url)
# NOTE(review): the condition guarding this raise is on a line not shown here.
467 raise ExtractorError(u'Invalid URL: %s' % url)
468 video_id = mobj.group(1)
# The first response only sets window.location from script; that target is
# appended to the URL actually fetched (urlh.geturl()) to reach the real page.
470 redirect_page, urlh = self._download_webpage_handle(url, video_id)
471 new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
472 redirect_url = urlh.geturl() + new_location
473 webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')
# Only the part of <title> before the first "/" is kept, whitespace-stripped.
475 title = self._html_search_regex(r'<title>(.*)</title>',
476 webpage, u'title').split('/')[0].strip()
# The media and thumbnail URLs come from a form-encoded request carrying
# as3=1 and the video id.
479 info_url = "http://vbox7.com/play/magare.do"
480 data = compat_urllib_parse.urlencode({'as3':'1','vid':video_id})
481 info_request = compat_urllib_request.Request(info_url, data)
482 info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
483 info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
484 if info_response is None:
485 raise ExtractorError(u'Unable to extract the media url')
# NOTE(review): this unpacking needs exactly two "&"-separated fields, and
# split('=')[1] keeps only the text between the first and second "=" of
# each field, so a value that itself contains "=" is truncated.
486 (final_url, thumbnail_url) = map(lambda x: x.split('=')[1], info_response.split('&'))
# Visible entry of the result dict; its opening and other keys are elided.
493 'thumbnail': thumbnail_url,
# Factory for the extractor instances described in the docstring below.
# NOTE(review): the end of the docstring and nearly all of the body are on
# lines not shown in this listing; the only visible entry is the
# StanfordOpenClassroomIE() instance.
497 def gen_extractors():
498 """ Return a list of an instance of every supported extractor.
499 The order does matter; the first extractor matched is the one handling the URL.
527 StanfordOpenClassroomIE(),
def get_info_extractor(ie_name):
    """Look up an info extractor class by its short name.

    The suffix 'IE' is appended to ie_name and the resulting name is fetched
    from this module's global namespace. A name that is not defined there
    propagates as KeyError.
    """
    class_name = ie_name + 'IE'
    module_namespace = globals()
    return module_namespace[class_name]