10 import xml.etree.ElementTree
19 from .extractor.common import InfoExtractor, SearchInfoExtractor
21 from .extractor.ard import ARDIE
22 from .extractor.arte import ArteTvIE
23 from .extractor.bliptv import BlipTVIE, BlipTVUserIE
24 from .extractor.comedycentral import ComedyCentralIE
25 from .extractor.collegehumor import CollegeHumorIE
26 from .extractor.dailymotion import DailymotionIE
27 from .extractor.depositfiles import DepositFilesIE
28 from .extractor.eighttracks import EightTracksIE
29 from .extractor.escapist import EscapistIE
30 from .extractor.facebook import FacebookIE
31 from .extractor.funnyordie import FunnyOrDieIE
32 from .extractor.gametrailers import GametrailersIE
33 from .extractor.generic import GenericIE
34 from .extractor.googleplus import GooglePlusIE
35 from .extractor.googlesearch import GoogleSearchIE
36 from .extractor.infoq import InfoQIE
37 from .extractor.justintv import JustinTVIE
38 from .extractor.keek import KeekIE
39 from .extractor.liveleak import LiveLeakIE
40 from .extractor.metacafe import MetacafeIE
41 from .extractor.mixcloud import MixcloudIE
42 from .extractor.mtv import MTVIE
43 from .extractor.myspass import MySpassIE
44 from .extractor.myvideo import MyVideoIE
45 from .extractor.nba import NBAIE
46 from .extractor.statigram import StatigramIE
47 from .extractor.photobucket import PhotobucketIE
48 from .extractor.pornotube import PornotubeIE
49 from .extractor.rbmaradio import RBMARadioIE
50 from .extractor.soundcloud import SoundcloudIE, SoundcloudSetIE
51 from .extractor.spiegel import SpiegelIE
52 from .extractor.stanfordoc import StanfordOpenClassroomIE
53 from .extractor.steam import SteamIE
54 from .extractor.ted import TEDIE
55 from .extractor.ustream import UstreamIE
56 from .extractor.vimeo import VimeoIE
57 from .extractor.worldstarhiphop import WorldStarHipHopIE
58 from .extractor.xnxx import XNXXIE
59 from .extractor.xvideos import XVideosIE
60 from .extractor.yahoo import YahooIE, YahooSearchIE
61 from .extractor.youjizz import YouJizzIE
62 from .extractor.youku import YoukuIE
63 from .extractor.youporn import YouPornIE
64 from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, YoutubeUserIE, YoutubeChannelIE
65 from .extractor.zdf import ZDFIE
99 class TumblrIE(InfoExtractor):
100 _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'
102 def _real_extract(self, url):
103 m_url = re.match(self._VALID_URL, url)
104 video_id = m_url.group('id')
105 blog = m_url.group('blog_name')
107 url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
108 webpage = self._download_webpage(url, video_id)
110 re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
111 video = re.search(re_video, webpage)
113 raise ExtractorError(u'Unable to extract video')
114 video_url = video.group('video_url')
115 ext = video.group('ext')
117 video_thumbnail = self._search_regex(r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
118 webpage, u'thumbnail', fatal=False) # We pick the first poster
119 if video_thumbnail: video_thumbnail = video_thumbnail.replace('\\', '')
121 # The only place where you can get a title, it's not complete,
122 # but searching in other places doesn't work for all videos
123 video_title = self._html_search_regex(r'<title>(?P<title>.*?)</title>',
124 webpage, u'title', flags=re.DOTALL)
126 return [{'id': video_id,
128 'title': video_title,
129 'thumbnail': video_thumbnail,
133 class BandcampIE(InfoExtractor):
134 _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'
136 def _real_extract(self, url):
137 mobj = re.match(self._VALID_URL, url)
138 title = mobj.group('title')
139 webpage = self._download_webpage(url, title)
140 # We get the link to the free download page
141 m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
142 if m_download is None:
143 raise ExtractorError(u'No free songs found')
145 download_link = m_download.group(1)
146 id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
147 webpage, re.MULTILINE|re.DOTALL).group('id')
149 download_webpage = self._download_webpage(download_link, id,
150 'Downloading free downloads page')
151 # We get the dictionary of the track from some JavaScript code
152 info = re.search(r'items: (.*?),$',
153 download_webpage, re.MULTILINE).group(1)
154 info = json.loads(info)[0]
155 # We pick mp3-320 for now, until format selection can be easily implemented.
156 mp3_info = info[u'downloads'][u'mp3-320']
157 # If we try to use this url it says the link has expired
158 initial_url = mp3_info[u'url']
159 re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
160 m_url = re.match(re_url, initial_url)
161 # We build the url we will use to get the final track url
162 # This url is built in Bandcamp in the script download_bunde_*.js
163 request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), id, m_url.group('ts'))
164 final_url_webpage = self._download_webpage(request_url, id, 'Requesting download url')
165 # If we could correctly generate the .rand field the url would be
166 #in the "download_url" key
167 final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)
169 track_info = {'id':id,
170 'title' : info[u'title'],
173 'thumbnail' : info[u'thumb_url'],
174 'uploader' : info[u'artist']
179 class RedTubeIE(InfoExtractor):
180 """Information Extractor for redtube"""
181 _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'
183 def _real_extract(self,url):
184 mobj = re.match(self._VALID_URL, url)
186 raise ExtractorError(u'Invalid URL: %s' % url)
188 video_id = mobj.group('id')
189 video_extension = 'mp4'
190 webpage = self._download_webpage(url, video_id)
192 self.report_extraction(video_id)
194 video_url = self._html_search_regex(r'<source src="(.+?)" type="video/mp4">',
195 webpage, u'video URL')
197 video_title = self._html_search_regex('<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
203 'ext': video_extension,
204 'title': video_title,
207 class InaIE(InfoExtractor):
208 """Information Extractor for Ina.fr"""
209 _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'
211 def _real_extract(self,url):
212 mobj = re.match(self._VALID_URL, url)
214 video_id = mobj.group('id')
215 mrss_url='http://player.ina.fr/notices/%s.mrss' % video_id
216 video_extension = 'mp4'
217 webpage = self._download_webpage(mrss_url, video_id)
219 self.report_extraction(video_id)
221 video_url = self._html_search_regex(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
222 webpage, u'video URL')
224 video_title = self._search_regex(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
230 'ext': video_extension,
231 'title': video_title,
234 class HowcastIE(InfoExtractor):
235 """Information Extractor for Howcast.com"""
236 _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'
238 def _real_extract(self, url):
239 mobj = re.match(self._VALID_URL, url)
241 video_id = mobj.group('id')
242 webpage_url = 'http://www.howcast.com/videos/' + video_id
243 webpage = self._download_webpage(webpage_url, video_id)
245 self.report_extraction(video_id)
247 video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
248 webpage, u'video URL')
250 video_title = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
253 video_description = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
254 webpage, u'description', fatal=False)
256 thumbnail = self._html_search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
257 webpage, u'thumbnail', fatal=False)
263 'title': video_title,
264 'description': video_description,
265 'thumbnail': thumbnail,
268 class VineIE(InfoExtractor):
269 """Information Extractor for Vine.co"""
270 _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'
272 def _real_extract(self, url):
273 mobj = re.match(self._VALID_URL, url)
275 video_id = mobj.group('id')
276 webpage_url = 'https://vine.co/v/' + video_id
277 webpage = self._download_webpage(webpage_url, video_id)
279 self.report_extraction(video_id)
281 video_url = self._html_search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
282 webpage, u'video URL')
284 video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
287 thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"',
288 webpage, u'thumbnail', fatal=False)
290 uploader = self._html_search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
291 webpage, u'uploader', fatal=False, flags=re.DOTALL)
297 'title': video_title,
298 'thumbnail': thumbnail,
299 'uploader': uploader,
302 class FlickrIE(InfoExtractor):
303 """Information Extractor for Flickr videos"""
304 _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'
306 def _real_extract(self, url):
307 mobj = re.match(self._VALID_URL, url)
309 video_id = mobj.group('id')
310 video_uploader_id = mobj.group('uploader_id')
311 webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
312 webpage = self._download_webpage(webpage_url, video_id)
314 secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')
316 first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
317 first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')
319 node_id = self._html_search_regex(r'<Item id="id">(\d+-\d+)</Item>',
320 first_xml, u'node_id')
322 second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
323 second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')
325 self.report_extraction(video_id)
327 mobj = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
329 raise ExtractorError(u'Unable to extract video url')
330 video_url = mobj.group(1) + unescapeHTML(mobj.group(2))
332 video_title = self._html_search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
333 webpage, u'video title')
335 video_description = self._html_search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
336 webpage, u'description', fatal=False)
338 thumbnail = self._html_search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
339 webpage, u'thumbnail', fatal=False)
345 'title': video_title,
346 'description': video_description,
347 'thumbnail': thumbnail,
348 'uploader_id': video_uploader_id,
351 class TeamcocoIE(InfoExtractor):
352 _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'
354 def _real_extract(self, url):
355 mobj = re.match(self._VALID_URL, url)
357 raise ExtractorError(u'Invalid URL: %s' % url)
358 url_title = mobj.group('url_title')
359 webpage = self._download_webpage(url, url_title)
361 video_id = self._html_search_regex(r'<article class="video" data-id="(\d+?)"',
362 webpage, u'video id')
364 self.report_extraction(video_id)
366 video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
369 thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)"',
370 webpage, u'thumbnail', fatal=False)
372 video_description = self._html_search_regex(r'<meta property="og:description" content="(.*?)"',
373 webpage, u'description', fatal=False)
375 data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
376 data = self._download_webpage(data_url, video_id, 'Downloading data webpage')
378 video_url = self._html_search_regex(r'<file type="high".*?>(.*?)</file>',
385 'title': video_title,
386 'thumbnail': thumbnail,
387 'description': video_description,
390 class XHamsterIE(InfoExtractor):
391 """Information Extractor for xHamster"""
392 _VALID_URL = r'(?:http://)?(?:www.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'
394 def _real_extract(self,url):
395 mobj = re.match(self._VALID_URL, url)
397 video_id = mobj.group('id')
398 mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
399 webpage = self._download_webpage(mrss_url, video_id)
401 mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
403 raise ExtractorError(u'Unable to extract media URL')
404 if len(mobj.group('server')) == 0:
405 video_url = compat_urllib_parse.unquote(mobj.group('file'))
407 video_url = mobj.group('server')+'/key='+mobj.group('file')
408 video_extension = video_url.split('.')[-1]
410 video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
413 # Can't see the description anywhere in the UI
414 # video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
415 # webpage, u'description', fatal=False)
416 # if video_description: video_description = unescapeHTML(video_description)
418 mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
420 video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
422 video_upload_date = None
423 self._downloader.report_warning(u'Unable to extract upload date')
425 video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
426 webpage, u'uploader id', default=u'anonymous')
428 video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
429 webpage, u'thumbnail', fatal=False)
434 'ext': video_extension,
435 'title': video_title,
436 # 'description': video_description,
437 'upload_date': video_upload_date,
438 'uploader_id': video_uploader_id,
439 'thumbnail': video_thumbnail
442 class HypemIE(InfoExtractor):
443 """Information Extractor for hypem"""
444 _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'
446 def _real_extract(self, url):
447 mobj = re.match(self._VALID_URL, url)
449 raise ExtractorError(u'Invalid URL: %s' % url)
450 track_id = mobj.group(1)
452 data = { 'ax': 1, 'ts': time.time() }
453 data_encoded = compat_urllib_parse.urlencode(data)
454 complete_url = url + "?" + data_encoded
455 request = compat_urllib_request.Request(complete_url)
456 response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
457 cookie = urlh.headers.get('Set-Cookie', '')
459 self.report_extraction(track_id)
461 html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
462 response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
464 track_list = json.loads(html_tracks)
465 track = track_list[u'tracks'][0]
467 raise ExtractorError(u'Hypemachine contained invalid JSON.')
470 track_id = track[u"id"]
471 artist = track[u"artist"]
472 title = track[u"song"]
474 serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
475 request = compat_urllib_request.Request(serve_url, "" , {'Content-Type': 'application/json'})
476 request.add_header('cookie', cookie)
477 song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
479 song_data = json.loads(song_data_json)
481 raise ExtractorError(u'Hypemachine contained invalid JSON.')
482 final_url = song_data[u"url"]
492 class Vbox7IE(InfoExtractor):
493 """Information Extractor for Vbox7"""
494 _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'
496 def _real_extract(self,url):
497 mobj = re.match(self._VALID_URL, url)
499 raise ExtractorError(u'Invalid URL: %s' % url)
500 video_id = mobj.group(1)
502 redirect_page, urlh = self._download_webpage_handle(url, video_id)
503 new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
504 redirect_url = urlh.geturl() + new_location
505 webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')
507 title = self._html_search_regex(r'<title>(.*)</title>',
508 webpage, u'title').split('/')[0].strip()
511 info_url = "http://vbox7.com/play/magare.do"
512 data = compat_urllib_parse.urlencode({'as3':'1','vid':video_id})
513 info_request = compat_urllib_request.Request(info_url, data)
514 info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
515 info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
516 if info_response is None:
517 raise ExtractorError(u'Unable to extract the media url')
518 (final_url, thumbnail_url) = map(lambda x: x.split('=')[1], info_response.split('&'))
525 'thumbnail': thumbnail_url,
529 def gen_extractors():
530 """ Return a list of an instance of every supported extractor.
531 The order does matter; the first extractor matched is the one handling the URL.
559 StanfordOpenClassroomIE(),
def get_info_extractor(ie_name):
    """Look up an info extractor by its short name.

    The suffix 'IE' is appended to ``ie_name`` and the resulting name is
    fetched from this module's global namespace.

    Raises KeyError when no module-level name ``ie_name + 'IE'`` exists.
    """
    extractor_name = ie_name + 'IE'
    module_namespace = globals()
    return module_namespace[extractor_name]