10 import xml.etree.ElementTree
19 from .extractor.common import InfoExtractor, SearchInfoExtractor
21 from .extractor.ard import ARDIE
22 from .extractor.arte import ArteTvIE
23 from .extractor.bandcamp import BandcampIE
24 from .extractor.bliptv import BlipTVIE, BlipTVUserIE
25 from .extractor.comedycentral import ComedyCentralIE
26 from .extractor.collegehumor import CollegeHumorIE
27 from .extractor.dailymotion import DailymotionIE
28 from .extractor.depositfiles import DepositFilesIE
29 from .extractor.eighttracks import EightTracksIE
30 from .extractor.escapist import EscapistIE
31 from .extractor.facebook import FacebookIE
32 from .extractor.funnyordie import FunnyOrDieIE
33 from .extractor.gametrailers import GametrailersIE
34 from .extractor.generic import GenericIE
35 from .extractor.googleplus import GooglePlusIE
36 from .extractor.googlesearch import GoogleSearchIE
37 from .extractor.hypem import HypemIE
38 from .extractor.ina import InaIE
39 from .extractor.infoq import InfoQIE
40 from .extractor.justintv import JustinTVIE
41 from .extractor.keek import KeekIE
42 from .extractor.liveleak import LiveLeakIE
43 from .extractor.metacafe import MetacafeIE
44 from .extractor.mixcloud import MixcloudIE
45 from .extractor.mtv import MTVIE
46 from .extractor.myspass import MySpassIE
47 from .extractor.myvideo import MyVideoIE
48 from .extractor.nba import NBAIE
49 from .extractor.statigram import StatigramIE
50 from .extractor.photobucket import PhotobucketIE
51 from .extractor.pornotube import PornotubeIE
52 from .extractor.rbmaradio import RBMARadioIE
53 from .extractor.redtube import RedTubeIE
54 from .extractor.soundcloud import SoundcloudIE, SoundcloudSetIE
55 from .extractor.spiegel import SpiegelIE
56 from .extractor.stanfordoc import StanfordOpenClassroomIE
57 from .extractor.steam import SteamIE
58 from .extractor.ted import TEDIE
59 from .extractor.tumblr import TumblrIE
60 from .extractor.ustream import UstreamIE
61 from .extractor.vbox7 import Vbox7IE
62 from .extractor.vimeo import VimeoIE
63 from .extractor.vine import VineIE
64 from .extractor.worldstarhiphop import WorldStarHipHopIE
65 from .extractor.xnxx import XNXXIE
66 from .extractor.xvideos import XVideosIE
67 from .extractor.yahoo import YahooIE, YahooSearchIE
68 from .extractor.youjizz import YouJizzIE
69 from .extractor.youku import YoukuIE
70 from .extractor.youporn import YouPornIE
71 from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, YoutubeUserIE, YoutubeChannelIE
72 from .extractor.zdf import ZDFIE
class HowcastIE(InfoExtractor):
    """Information Extractor for Howcast.com"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'

    def _real_extract(self, url):
        """Scrape a Howcast video page and return its info dictionary.

        The numeric id is taken from ``url``, the page
        ``http://www.howcast.com/videos/<id>`` is downloaded, and the media
        URL, title, description and thumbnail are read from it with regular
        expressions.  Description and thumbnail are looked up with
        ``fatal=False``.

        Returns a one-element list holding the info dictionary.
        Raises ExtractorError when ``url`` does not match ``_VALID_URL``.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # Without this guard a non-matching URL would surface as an
            # AttributeError on ``None.group`` instead of a readable error.
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        # Always fetch the canonical page, whatever form the input URL had.
        webpage_url = 'http://www.howcast.com/videos/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        self.report_extraction(video_id)

        video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
            webpage, u'video URL')

        # The content attribute may be quoted with either " or '.
        video_title = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
            webpage, u'title')

        video_description = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
            webpage, u'description', fatal=False)

        thumbnail = self._html_search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            # The media URL pattern above only accepts ``.mp4`` files.
            'ext': 'mp4',
            'title': video_title,
            'description': video_description,
            'thumbnail': thumbnail,
        }]
class FlickrIE(InfoExtractor):
    """Information Extractor for Flickr videos"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'

    def _real_extract(self, url):
        """Resolve a Flickr photo-page URL to a playable video stream.

        Three documents are downloaded in turn: the photo page (for the
        ``photo_secret`` and the Open Graph metadata), a first XML document
        that yields a node id, and a second XML playlist that contains the
        stream's APP and FULLPATH parts.  Description and thumbnail are
        looked up with ``fatal=False``.

        Returns a one-element list holding the info dictionary.
        Raises ExtractorError when ``url`` does not match ``_VALID_URL`` or
        when the playlist contains no STREAM element.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # Without this guard a non-matching URL would surface as an
            # AttributeError on ``None.group`` instead of a readable error.
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        video_uploader_id = mobj.group('uploader_id')
        webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
        webpage = self._download_webpage(webpage_url, video_id)

        # The secret found on the photo page is required by both requests below.
        secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')

        first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
        first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')

        node_id = self._html_search_regex(r'<Item id="id">(\d+-\d+)</Item>',
            first_xml, u'node_id')

        second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
        second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')

        self.report_extraction(video_id)

        mobj = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
        if mobj is None:
            raise ExtractorError(u'Unable to extract video url')
        # The final URL is APP followed by the HTML-unescaped FULLPATH.
        video_url = mobj.group(1) + unescapeHTML(mobj.group(2))

        # The content attribute may be quoted with either " or '.
        video_title = self._html_search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'video title')

        video_description = self._html_search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'description', fatal=False)

        thumbnail = self._html_search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            # NOTE(review): container is assumed to be mp4 -- confirm.
            'ext': 'mp4',
            'title': video_title,
            'description': video_description,
            'thumbnail': thumbnail,
            'uploader_id': video_uploader_id,
        }]
class TeamcocoIE(InfoExtractor):
    """Information Extractor for teamcoco.com videos"""
    _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'

    def _real_extract(self, url):
        """Return the info dictionary for a teamcoco.com video page.

        The page at ``url`` supplies the numeric video id and the Open Graph
        title, thumbnail and description; the media URL is then read from
        the ``cvp/2.0/<id>.xml`` document.  Thumbnail and description are
        looked up with ``fatal=False``.

        Returns a one-element list holding the info dictionary.
        Raises ExtractorError when ``url`` does not match ``_VALID_URL``.
        """
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # Until the numeric id is known, the URL slug identifies the download.
        slug = url_match.group('url_title')
        page = self._download_webpage(url, slug)

        video_id = self._html_search_regex(
            r'<article class="video" data-id="(\d+?)"',
            page, u'video id')
        self.report_extraction(video_id)

        title = self._html_search_regex(
            r'<meta property="og:title" content="(.+?)"',
            page, u'title')
        thumb = self._html_search_regex(
            r'<meta property="og:image" content="(.+?)"',
            page, u'thumbnail', fatal=False)
        description = self._html_search_regex(
            r'<meta property="og:description" content="(.*?)"',
            page, u'description', fatal=False)

        # The stream location lives in a separate per-video XML document.
        data_xml = self._download_webpage(
            'http://teamcoco.com/cvp/2.0/%s.xml' % video_id,
            video_id, 'Downloading data webpage')
        media_url = self._html_search_regex(
            r'<file type="high".*?>(.*?)</file>',
            data_xml, u'video URL')

        info = {
            'id': video_id,
            'url': media_url,
            # NOTE(review): container is assumed to be mp4 -- confirm.
            'ext': 'mp4',
            'title': title,
            'thumbnail': thumb,
            'description': description,
        }
        return [info]
class XHamsterIE(InfoExtractor):
    """Information Extractor for xHamster"""
    # The dot after "www" is escaped so that it only matches a literal ".".
    _VALID_URL = r'(?:http://)?(?:www\.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'

    def _real_extract(self, url):
        """Scrape an xHamster movie page and return its info dictionary.

        The media URL is assembled from the ``'srv'`` and ``'file'`` values
        found in the page; title, upload date, uploader id and thumbnail
        are read from the same page.  A missing upload date only produces a
        warning, a missing uploader falls back to ``u'anonymous'`` and the
        thumbnail is looked up with ``fatal=False``.

        Returns a one-element list holding the info dictionary.
        Raises ExtractorError when ``url`` does not match ``_VALID_URL`` or
        when the media location cannot be found in the page.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # Without this guard a non-matching URL would surface as an
            # AttributeError on ``None.group`` instead of a readable error.
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('id')
        mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
        webpage = self._download_webpage(mrss_url, video_id)

        mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
        if mobj is None:
            raise ExtractorError(u'Unable to extract media URL')
        if not mobj.group('server'):
            # Empty server: the percent-decoded file value is the whole URL.
            video_url = compat_urllib_parse.unquote(mobj.group('file'))
        else:
            video_url = mobj.group('server') + '/key=' + mobj.group('file')
        # Whatever follows the last dot of the media URL is used as extension.
        video_extension = video_url.split('.')[-1]

        video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
            webpage, u'title')

        # No description is extracted: it cannot be seen anywhere in the UI.

        mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
        if mobj:
            # Concatenate year, month and day into a YYYYMMDD string.
            video_upload_date = mobj.group('upload_date_Y') + mobj.group('upload_date_m') + mobj.group('upload_date_d')
        else:
            video_upload_date = None
            self._downloader.report_warning(u'Unable to extract upload date')

        video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
            webpage, u'uploader id', default=u'anonymous')

        video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_extension,
            'title': video_title,
            'upload_date': video_upload_date,
            'uploader_id': video_uploader_id,
            'thumbnail': video_thumbnail,
        }]
288 def gen_extractors():
289 """ Return a list of an instance of every supported extractor.
290 The order does matter; the first extractor matched is the one handling the URL.
318 StanfordOpenClassroomIE(),
def get_info_extractor(ie_name):
    """Look up an info extractor class by its short name.

    ``ie_name`` is the class name without the trailing ``IE``.  The class is
    fetched from this module's global namespace, so a name with no matching
    global raises ``KeyError``.
    """
    class_name = ie_name + 'IE'
    module_namespace = globals()
    return module_namespace[class_name]