10 import xml.etree.ElementTree
19 from .extractor.common import InfoExtractor, SearchInfoExtractor
21 from .extractor.ard import ARDIE
22 from .extractor.arte import ArteTvIE
23 from .extractor.bandcamp import BandcampIE
24 from .extractor.bliptv import BlipTVIE, BlipTVUserIE
25 from .extractor.comedycentral import ComedyCentralIE
26 from .extractor.collegehumor import CollegeHumorIE
27 from .extractor.dailymotion import DailymotionIE
28 from .extractor.depositfiles import DepositFilesIE
29 from .extractor.eighttracks import EightTracksIE
30 from .extractor.escapist import EscapistIE
31 from .extractor.facebook import FacebookIE
32 from .extractor.funnyordie import FunnyOrDieIE
33 from .extractor.gametrailers import GametrailersIE
34 from .extractor.generic import GenericIE
35 from .extractor.googleplus import GooglePlusIE
36 from .extractor.googlesearch import GoogleSearchIE
37 from .extractor.infoq import InfoQIE
38 from .extractor.justintv import JustinTVIE
39 from .extractor.keek import KeekIE
40 from .extractor.liveleak import LiveLeakIE
41 from .extractor.metacafe import MetacafeIE
42 from .extractor.mixcloud import MixcloudIE
43 from .extractor.mtv import MTVIE
44 from .extractor.myspass import MySpassIE
45 from .extractor.myvideo import MyVideoIE
46 from .extractor.nba import NBAIE
47 from .extractor.statigram import StatigramIE
48 from .extractor.photobucket import PhotobucketIE
49 from .extractor.pornotube import PornotubeIE
50 from .extractor.rbmaradio import RBMARadioIE
51 from .extractor.soundcloud import SoundcloudIE, SoundcloudSetIE
52 from .extractor.spiegel import SpiegelIE
53 from .extractor.stanfordoc import StanfordOpenClassroomIE
54 from .extractor.steam import SteamIE
55 from .extractor.ted import TEDIE
56 from .extractor.tumblr import TumblrIE
57 from .extractor.ustream import UstreamIE
58 from .extractor.vbox7 import Vbox7IE
59 from .extractor.vimeo import VimeoIE
60 from .extractor.worldstarhiphop import WorldStarHipHopIE
61 from .extractor.xnxx import XNXXIE
62 from .extractor.xvideos import XVideosIE
63 from .extractor.yahoo import YahooIE, YahooSearchIE
64 from .extractor.youjizz import YouJizzIE
65 from .extractor.youku import YoukuIE
66 from .extractor.youporn import YouPornIE
67 from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, YoutubeUserIE, YoutubeChannelIE
68 from .extractor.zdf import ZDFIE
104 class RedTubeIE(InfoExtractor):
105 """Information Extractor for redtube"""
106 _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'
108 def _real_extract(self,url):
109 mobj = re.match(self._VALID_URL, url)
111 raise ExtractorError(u'Invalid URL: %s' % url)
113 video_id = mobj.group('id')
114 video_extension = 'mp4'
115 webpage = self._download_webpage(url, video_id)
117 self.report_extraction(video_id)
119 video_url = self._html_search_regex(r'<source src="(.+?)" type="video/mp4">',
120 webpage, u'video URL')
122 video_title = self._html_search_regex('<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
128 'ext': video_extension,
129 'title': video_title,
132 class InaIE(InfoExtractor):
133 """Information Extractor for Ina.fr"""
134 _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'
136 def _real_extract(self,url):
137 mobj = re.match(self._VALID_URL, url)
139 video_id = mobj.group('id')
140 mrss_url='http://player.ina.fr/notices/%s.mrss' % video_id
141 video_extension = 'mp4'
142 webpage = self._download_webpage(mrss_url, video_id)
144 self.report_extraction(video_id)
146 video_url = self._html_search_regex(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
147 webpage, u'video URL')
149 video_title = self._search_regex(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
155 'ext': video_extension,
156 'title': video_title,
159 class HowcastIE(InfoExtractor):
160 """Information Extractor for Howcast.com"""
161 _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'
163 def _real_extract(self, url):
164 mobj = re.match(self._VALID_URL, url)
166 video_id = mobj.group('id')
167 webpage_url = 'http://www.howcast.com/videos/' + video_id
168 webpage = self._download_webpage(webpage_url, video_id)
170 self.report_extraction(video_id)
172 video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
173 webpage, u'video URL')
175 video_title = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
178 video_description = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
179 webpage, u'description', fatal=False)
181 thumbnail = self._html_search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
182 webpage, u'thumbnail', fatal=False)
188 'title': video_title,
189 'description': video_description,
190 'thumbnail': thumbnail,
193 class VineIE(InfoExtractor):
194 """Information Extractor for Vine.co"""
195 _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'
197 def _real_extract(self, url):
198 mobj = re.match(self._VALID_URL, url)
200 video_id = mobj.group('id')
201 webpage_url = 'https://vine.co/v/' + video_id
202 webpage = self._download_webpage(webpage_url, video_id)
204 self.report_extraction(video_id)
206 video_url = self._html_search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
207 webpage, u'video URL')
209 video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
212 thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"',
213 webpage, u'thumbnail', fatal=False)
215 uploader = self._html_search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
216 webpage, u'uploader', fatal=False, flags=re.DOTALL)
222 'title': video_title,
223 'thumbnail': thumbnail,
224 'uploader': uploader,
227 class FlickrIE(InfoExtractor):
228 """Information Extractor for Flickr videos"""
229 _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'
231 def _real_extract(self, url):
232 mobj = re.match(self._VALID_URL, url)
234 video_id = mobj.group('id')
235 video_uploader_id = mobj.group('uploader_id')
236 webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
237 webpage = self._download_webpage(webpage_url, video_id)
239 secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')
241 first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
242 first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')
244 node_id = self._html_search_regex(r'<Item id="id">(\d+-\d+)</Item>',
245 first_xml, u'node_id')
247 second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
248 second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')
250 self.report_extraction(video_id)
252 mobj = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
254 raise ExtractorError(u'Unable to extract video url')
255 video_url = mobj.group(1) + unescapeHTML(mobj.group(2))
257 video_title = self._html_search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
258 webpage, u'video title')
260 video_description = self._html_search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
261 webpage, u'description', fatal=False)
263 thumbnail = self._html_search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
264 webpage, u'thumbnail', fatal=False)
270 'title': video_title,
271 'description': video_description,
272 'thumbnail': thumbnail,
273 'uploader_id': video_uploader_id,
276 class TeamcocoIE(InfoExtractor):
277 _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'
279 def _real_extract(self, url):
280 mobj = re.match(self._VALID_URL, url)
282 raise ExtractorError(u'Invalid URL: %s' % url)
283 url_title = mobj.group('url_title')
284 webpage = self._download_webpage(url, url_title)
286 video_id = self._html_search_regex(r'<article class="video" data-id="(\d+?)"',
287 webpage, u'video id')
289 self.report_extraction(video_id)
291 video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
294 thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)"',
295 webpage, u'thumbnail', fatal=False)
297 video_description = self._html_search_regex(r'<meta property="og:description" content="(.*?)"',
298 webpage, u'description', fatal=False)
300 data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
301 data = self._download_webpage(data_url, video_id, 'Downloading data webpage')
303 video_url = self._html_search_regex(r'<file type="high".*?>(.*?)</file>',
310 'title': video_title,
311 'thumbnail': thumbnail,
312 'description': video_description,
315 class XHamsterIE(InfoExtractor):
316 """Information Extractor for xHamster"""
317 _VALID_URL = r'(?:http://)?(?:www.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'
319 def _real_extract(self,url):
320 mobj = re.match(self._VALID_URL, url)
322 video_id = mobj.group('id')
323 mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
324 webpage = self._download_webpage(mrss_url, video_id)
326 mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
328 raise ExtractorError(u'Unable to extract media URL')
329 if len(mobj.group('server')) == 0:
330 video_url = compat_urllib_parse.unquote(mobj.group('file'))
332 video_url = mobj.group('server')+'/key='+mobj.group('file')
333 video_extension = video_url.split('.')[-1]
335 video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
338 # Can't see the description anywhere in the UI
339 # video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
340 # webpage, u'description', fatal=False)
341 # if video_description: video_description = unescapeHTML(video_description)
343 mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
345 video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
347 video_upload_date = None
348 self._downloader.report_warning(u'Unable to extract upload date')
350 video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
351 webpage, u'uploader id', default=u'anonymous')
353 video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
354 webpage, u'thumbnail', fatal=False)
359 'ext': video_extension,
360 'title': video_title,
361 # 'description': video_description,
362 'upload_date': video_upload_date,
363 'uploader_id': video_uploader_id,
364 'thumbnail': video_thumbnail
367 class HypemIE(InfoExtractor):
368 """Information Extractor for hypem"""
369 _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'
371 def _real_extract(self, url):
372 mobj = re.match(self._VALID_URL, url)
374 raise ExtractorError(u'Invalid URL: %s' % url)
375 track_id = mobj.group(1)
377 data = { 'ax': 1, 'ts': time.time() }
378 data_encoded = compat_urllib_parse.urlencode(data)
379 complete_url = url + "?" + data_encoded
380 request = compat_urllib_request.Request(complete_url)
381 response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
382 cookie = urlh.headers.get('Set-Cookie', '')
384 self.report_extraction(track_id)
386 html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
387 response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
389 track_list = json.loads(html_tracks)
390 track = track_list[u'tracks'][0]
392 raise ExtractorError(u'Hypemachine contained invalid JSON.')
395 track_id = track[u"id"]
396 artist = track[u"artist"]
397 title = track[u"song"]
399 serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
400 request = compat_urllib_request.Request(serve_url, "" , {'Content-Type': 'application/json'})
401 request.add_header('cookie', cookie)
402 song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
404 song_data = json.loads(song_data_json)
406 raise ExtractorError(u'Hypemachine contained invalid JSON.')
407 final_url = song_data[u"url"]
419 def gen_extractors():
420 """ Return a list of an instance of every supported extractor.
421 The order does matter; the first extractor matched is the one handling the URL.
449 StanfordOpenClassroomIE(),
def get_info_extractor(ie_name):
    """Look up an info extractor by its short name.

    The suffix 'IE' is appended to ``ie_name`` and the resulting identifier
    is resolved in this module's global namespace. A ``KeyError`` propagates
    to the caller when no global with that name exists.
    """
    global_name = ie_name + 'IE'
    module_namespace = globals()
    return module_namespace[global_name]