10 import xml.etree.ElementTree
19 from .extractor.common import InfoExtractor, SearchInfoExtractor
21 from .extractor.ard import ARDIE
22 from .extractor.arte import ArteTvIE
23 from .extractor.bandcamp import BandcampIE
24 from .extractor.bliptv import BlipTVIE, BlipTVUserIE
25 from .extractor.comedycentral import ComedyCentralIE
26 from .extractor.collegehumor import CollegeHumorIE
27 from .extractor.dailymotion import DailymotionIE
28 from .extractor.depositfiles import DepositFilesIE
29 from .extractor.eighttracks import EightTracksIE
30 from .extractor.escapist import EscapistIE
31 from .extractor.facebook import FacebookIE
32 from .extractor.funnyordie import FunnyOrDieIE
33 from .extractor.gametrailers import GametrailersIE
34 from .extractor.generic import GenericIE
35 from .extractor.googleplus import GooglePlusIE
36 from .extractor.googlesearch import GoogleSearchIE
37 from .extractor.ina import InaIE
38 from .extractor.infoq import InfoQIE
39 from .extractor.justintv import JustinTVIE
40 from .extractor.keek import KeekIE
41 from .extractor.liveleak import LiveLeakIE
42 from .extractor.metacafe import MetacafeIE
43 from .extractor.mixcloud import MixcloudIE
44 from .extractor.mtv import MTVIE
45 from .extractor.myspass import MySpassIE
46 from .extractor.myvideo import MyVideoIE
47 from .extractor.nba import NBAIE
48 from .extractor.statigram import StatigramIE
49 from .extractor.photobucket import PhotobucketIE
50 from .extractor.pornotube import PornotubeIE
51 from .extractor.rbmaradio import RBMARadioIE
52 from .extractor.redtube import RedTubeIE
53 from .extractor.soundcloud import SoundcloudIE, SoundcloudSetIE
54 from .extractor.spiegel import SpiegelIE
55 from .extractor.stanfordoc import StanfordOpenClassroomIE
56 from .extractor.steam import SteamIE
57 from .extractor.ted import TEDIE
58 from .extractor.tumblr import TumblrIE
59 from .extractor.ustream import UstreamIE
60 from .extractor.vbox7 import Vbox7IE
61 from .extractor.vimeo import VimeoIE
62 from .extractor.vine import VineIE
63 from .extractor.worldstarhiphop import WorldStarHipHopIE
64 from .extractor.xnxx import XNXXIE
65 from .extractor.xvideos import XVideosIE
66 from .extractor.yahoo import YahooIE, YahooSearchIE
67 from .extractor.youjizz import YouJizzIE
68 from .extractor.youku import YoukuIE
69 from .extractor.youporn import YouPornIE
70 from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, YoutubeUserIE, YoutubeChannelIE
71 from .extractor.zdf import ZDFIE
class HowcastIE(InfoExtractor):
    """Information Extractor for Howcast.com"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'

    def _real_extract(self, url):
        """Extract the media URL and metadata of a single Howcast video.

        The numeric id is taken from ``url`` via ``_VALID_URL``; the page
        ``http://www.howcast.com/videos/<id>`` is then downloaded and the
        mp4 location, title, description and thumbnail are pulled out of it
        with regular expressions.

        Returns a one-element list containing the info dictionary.
        Raises ExtractorError if ``url`` does not match ``_VALID_URL``.
        """
        mobj = re.match(self._VALID_URL, url)
        # Fail with a descriptive error rather than an AttributeError on
        # ``None`` (same guard and message as the other extractors here).
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        # The page is always requested from a URL rebuilt from the id,
        # whatever form the incoming URL had.
        webpage_url = 'http://www.howcast.com/videos/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)

        video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
            webpage, u'video URL')

        # NOTE(review): the continuation line of this call was not visible in
        # the reviewed excerpt; ``webpage, u'title'`` is assumed - confirm.
        video_title = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
            webpage, u'title')

        # fatal=False: presumably a missing description/thumbnail is tolerated
        # by the helper instead of aborting - confirm against InfoExtractor.
        video_description = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
            webpage, u'description', fatal=False)

        thumbnail = self._html_search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
            webpage, u'thumbnail', fatal=False)

        # NOTE(review): the opening of the result list and its 'id' / 'url' /
        # 'ext' entries were not visible in the reviewed excerpt; they are
        # reconstructed from the values computed above ('mp4' follows from the
        # ``\.mp4`` suffix required by the media URL pattern) - confirm.
        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'description': video_description,
            'thumbnail': thumbnail,
        }]
class FlickrIE(InfoExtractor):
    """Information Extractor for Flickr videos"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'

    def _real_extract(self, url):
        """Extract the stream URL and metadata of a single Flickr video.

        Steps, in order:
          1. download the photo page and read its ``photo_secret``;
          2. download a first XML document to obtain the ``node_id``;
          3. download a second XML document (the playlist) and read the
             ``<STREAM APP=... FULLPATH=...>`` element, whose two attributes
             are concatenated into the media URL;
          4. read title, description and thumbnail from the photo page's
             ``og:`` meta tags.

        Returns a one-element list containing the info dictionary.
        Raises ExtractorError if ``url`` does not match ``_VALID_URL`` or if
        no ``<STREAM>`` element is found in the playlist.
        """
        mobj = re.match(self._VALID_URL, url)
        # Fail with a descriptive error rather than an AttributeError on
        # ``None`` (same guard and message as the other extractors here).
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        video_uploader_id = mobj.group('uploader_id')
        webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        # The secret is needed by both follow-up requests below.
        secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')

        first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
        first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')

        node_id = self._html_search_regex(r'<Item id="id">(\d+-\d+)</Item>',
            first_xml, u'node_id')

        second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
        second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')

        self.report_extraction(video_id)

        stream_mobj = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
        if stream_mobj is None:
            raise ExtractorError(u'Unable to extract video url')
        # Only FULLPATH is passed through unescapeHTML; APP is used verbatim.
        video_url = stream_mobj.group(1) + unescapeHTML(stream_mobj.group(2))

        video_title = self._html_search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'video title')

        # fatal=False: presumably a missing description/thumbnail is tolerated
        # by the helper instead of aborting - confirm against InfoExtractor.
        video_description = self._html_search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'description', fatal=False)

        thumbnail = self._html_search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'thumbnail', fatal=False)

        # NOTE(review): the opening of the result list and its 'id' / 'url' /
        # 'ext' entries were not visible in the reviewed excerpt; they are
        # reconstructed from the values computed above and the 'mp4'
        # extension in particular is an assumption - confirm.
        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'description': video_description,
            'thumbnail': thumbnail,
            'uploader_id': video_uploader_id,
        }]
class TeamcocoIE(InfoExtractor):
    """Information Extractor for teamcoco.com videos"""
    _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'

    def _real_extract(self, url):
        """Extract the media URL and metadata of a single Team Coco video.

        The video page is downloaded to obtain the numeric video id (from
        the ``<article class="video" data-id="...">`` element) and the
        ``og:`` meta data; the media URL is then read from the
        ``<file type="high">`` element of
        ``http://teamcoco.com/cvp/2.0/<id>.xml``.

        Returns a one-element list containing the info dictionary.
        Raises ExtractorError if ``url`` does not match ``_VALID_URL``.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        # The URL slug only labels the first download; the real id is read
        # from the page itself.
        url_title = mobj.group('url_title')
        webpage = self._download_webpage(url, url_title)

        video_id = self._html_search_regex(r'<article class="video" data-id="(\d+?)"',
            webpage, u'video id')

        self.report_extraction(video_id)

        # NOTE(review): the continuation line of this call was not visible in
        # the reviewed excerpt; ``webpage, u'title'`` is assumed - confirm.
        video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
            webpage, u'title')

        # fatal=False: presumably a missing thumbnail/description is tolerated
        # by the helper instead of aborting - confirm against InfoExtractor.
        thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)"',
            webpage, u'thumbnail', fatal=False)

        video_description = self._html_search_regex(r'<meta property="og:description" content="(.*?)"',
            webpage, u'description', fatal=False)

        data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
        data = self._download_webpage(data_url, video_id, 'Downloading data webpage')

        # NOTE(review): the continuation line of this call was not visible in
        # the reviewed excerpt. ``data`` is the only document that can hold a
        # <file> element (it is otherwise unused); the field label
        # u'video URL' is an assumption - confirm.
        video_url = self._html_search_regex(r'<file type="high".*?>(.*?)</file>',
            data, u'video URL')

        # NOTE(review): the opening of the result list and its 'id' / 'url' /
        # 'ext' entries were not visible in the reviewed excerpt; they are
        # reconstructed from the values computed above and the 'mp4'
        # extension in particular is an assumption - confirm.
        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'thumbnail': thumbnail,
            'description': video_description,
        }]
class XHamsterIE(InfoExtractor):
    """Information Extractor for xHamster"""
    # The dot after "www" is escaped so that it only matches a literal '.'.
    _VALID_URL = r'(?:http://)?(?:www\.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'

    def _real_extract(self, url):
        """Extract the media URL and metadata of a single xHamster video.

        The page ``http://xhamster.com/movies/<id>/.html`` is downloaded and
        the media location is read from its ``'srv'`` / ``'file'`` pair.
        Title, upload date, uploader id and thumbnail are read from the same
        page. The description is not extracted (it is not visible anywhere
        in the site's UI).

        Returns a one-element list containing the info dictionary;
        ``upload_date`` is ``None`` (and a warning is reported) when no date
        is found.
        Raises ExtractorError if ``url`` does not match ``_VALID_URL`` or if
        the ``'srv'`` / ``'file'`` pair cannot be found.
        """
        mobj = re.match(self._VALID_URL, url)
        # Fail with a descriptive error rather than an AttributeError on
        # ``None`` (same guard and message as the other extractors here).
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
        webpage = self._download_webpage(mrss_url, video_id)

        media_mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
        if media_mobj is None:
            raise ExtractorError(u'Unable to extract media URL')
        server = media_mobj.group('server')
        if not server:
            # Without a server the 'file' value is itself the
            # (percent-encoded) media URL.
            video_url = compat_urllib_parse.unquote(media_mobj.group('file'))
        else:
            video_url = server + '/key=' + media_mobj.group('file')
        video_extension = video_url.split('.')[-1]

        # NOTE(review): the continuation line of this call was not visible in
        # the reviewed excerpt; ``webpage, u'title'`` is assumed - confirm.
        video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
            webpage, u'title')

        # The date is optional: a missing one only triggers a warning.
        date_mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
        if date_mobj:
            # YYYYMMDD
            video_upload_date = date_mobj.group('upload_date_Y') + date_mobj.group('upload_date_m') + date_mobj.group('upload_date_d')
        else:
            video_upload_date = None
            self._downloader.report_warning(u'Unable to extract upload date')

        video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
            webpage, u'uploader id', default=u'anonymous')

        # fatal=False: presumably a missing thumbnail is tolerated by the
        # helper instead of aborting - confirm against InfoExtractor.
        video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
            webpage, u'thumbnail', fatal=False)

        # NOTE(review): the opening of the result list and its 'id' / 'url'
        # entries were not visible in the reviewed excerpt; they are
        # reconstructed from the values computed above - confirm.
        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_extension,
            'title': video_title,
            'upload_date': video_upload_date,
            'uploader_id': video_uploader_id,
            'thumbnail': video_thumbnail,
        }]
class HypemIE(InfoExtractor):
    """Information Extractor for hypem"""
    _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'

    def _real_extract(self, url):
        """Extract the media URL and metadata of a single hypem.com track.

        The track page is requested with ``ax`` and ``ts`` query parameters
        appended and its ``Set-Cookie`` header is kept. The first entry of
        the JSON embedded in the ``displayList-data`` script element supplies
        id, artist and song title. The media location is the ``url`` field of
        the JSON returned by ``http://hypem.com/serve/source/<id>/<key>``,
        which is requested with the kept cookie.

        Returns a one-element list containing the info dictionary.
        Raises ExtractorError if ``url`` does not match ``_VALID_URL`` or if
        either JSON document cannot be parsed.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        # Provisional id taken from the URL; replaced below by the id found
        # in the page's JSON.
        track_id = mobj.group(1)

        data = {'ax': 1, 'ts': time.time()}
        data_encoded = compat_urllib_parse.urlencode(data)
        complete_url = url + "?" + data_encoded
        request = compat_urllib_request.Request(complete_url)
        response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
        # Sent back with the second request below.
        cookie = urlh.headers.get('Set-Cookie', '')

        self.report_extraction(track_id)

        html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
            response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
        # NOTE(review): the ``try:`` / ``except`` lines around both JSON
        # parses were not visible in the reviewed excerpt; ValueError (what
        # json.loads raises for malformed input) is assumed - confirm.
        try:
            track_list = json.loads(html_tracks)
            track = track_list[u'tracks'][0]
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')

        # NOTE(review): the assignment of ``key`` was not visible in the
        # reviewed excerpt although ``key`` is used for the serve URL; it is
        # assumed to come from the track entry like the fields next to it -
        # confirm.
        key = track[u"key"]
        track_id = track[u"id"]
        artist = track[u"artist"]
        title = track[u"song"]

        serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
        # The (empty) request body must be a byte string: on Python 3 urllib
        # refuses a text ``str`` as POST data. b"" is the same object type as
        # "" on Python 2, so nothing changes there.
        request = compat_urllib_request.Request(serve_url, b"", {'Content-Type': 'application/json'})
        request.add_header('cookie', cookie)
        song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
        try:
            song_data = json.loads(song_data_json)
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')
        final_url = song_data[u"url"]

        # NOTE(review): the whole return statement was not visible in the
        # reviewed excerpt; it is reconstructed from the values computed
        # above. The key names and the 'mp3' extension are assumptions -
        # confirm.
        return [{
            'id': track_id,
            'url': final_url,
            'ext': "mp3",
            'title': title,
            'artist': artist,
        }]
# NOTE(review): only the start of the docstring and a single list entry of
# this function are visible in this excerpt; the full, order-sensitive list
# of extractor instances it returns is not shown here.
335 def gen_extractors():
336 """ Return a list of an instance of every supported extractor.
337 The order does matter; the first extractor matched is the one handling the URL.
365 StanfordOpenClassroomIE(),
def get_info_extractor(ie_name):
    """Return the info extractor class with the given ie_name.

    ``ie_name`` is the class name without its ``IE`` suffix (e.g. ``'Vimeo'``
    for ``VimeoIE``). The class is looked up among this module's global
    names, so only extractors defined in or imported into this module can be
    found.

    Raises KeyError if no global named ``ie_name + 'IE'`` exists.
    """
    return globals()[ie_name + 'IE']