10 import xml.etree.ElementTree
19 from .extractor.common import InfoExtractor, SearchInfoExtractor
21 from .extractor.ard import ARDIE
22 from .extractor.arte import ArteTvIE
23 from .extractor.bliptv import BlipTVIE, BlipTVUserIE
24 from .extractor.comedycentral import ComedyCentralIE
25 from .extractor.collegehumor import CollegeHumorIE
26 from .extractor.dailymotion import DailymotionIE
27 from .extractor.depositfiles import DepositFilesIE
28 from .extractor.eighttracks import EightTracksIE
29 from .extractor.escapist import EscapistIE
30 from .extractor.facebook import FacebookIE
31 from .extractor.funnyordie import FunnyOrDieIE
32 from .extractor.gametrailers import GametrailersIE
33 from .extractor.generic import GenericIE
34 from .extractor.googleplus import GooglePlusIE
35 from .extractor.googlesearch import GoogleSearchIE
36 from .extractor.infoq import InfoQIE
37 from .extractor.justintv import JustinTVIE
38 from .extractor.keek import KeekIE
39 from .extractor.metacafe import MetacafeIE
40 from .extractor.mixcloud import MixcloudIE
41 from .extractor.mtv import MTVIE
42 from .extractor.myvideo import MyVideoIE
43 from .extractor.nba import NBAIE
44 from .extractor.statigram import StatigramIE
45 from .extractor.photobucket import PhotobucketIE
46 from .extractor.pornotube import PornotubeIE
47 from .extractor.rbmaradio import RBMARadioIE
48 from .extractor.soundcloud import SoundcloudIE, SoundcloudSetIE
49 from .extractor.stanfordoc import StanfordOpenClassroomIE
50 from .extractor.steam import SteamIE
51 from .extractor.ted import TEDIE
52 from .extractor.ustream import UstreamIE
53 from .extractor.vimeo import VimeoIE
54 from .extractor.worldstarhiphop import WorldStarHipHopIE
55 from .extractor.xnxx import XNXXIE
56 from .extractor.xvideos import XVideosIE
57 from .extractor.yahoo import YahooIE, YahooSearchIE
58 from .extractor.youjizz import YouJizzIE
59 from .extractor.youku import YoukuIE
60 from .extractor.youporn import YouPornIE
61 from .extractor.youtube import YoutubeIE, YoutubePlaylistIE, YoutubeSearchIE, YoutubeUserIE, YoutubeChannelIE
62 from .extractor.zdf import ZDFIE
class MySpassIE(InfoExtractor):
    """Information Extractor for myspass.de"""
    _VALID_URL = r'http://www\.myspass\.de/.*'

    def _real_extract(self, url):
        """Build the info dict for the video addressed by *url*.

        The video id is taken from the URL path and used to download the
        metadata XML, from which every returned field is read.

        Raises ExtractorError when the metadata has no download URL or title.
        """
        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'

        # The video id is the last path element of the URL. URLs usually end
        # with a slash, in which case the second-to-last element is used.
        url_path = compat_urllib_parse_urlparse(url).path
        parent_path, video_id = os.path.split(url_path)
        if not video_id:
            video_id = os.path.basename(parent_path)

        # Download and parse the metadata document.
        metadata_url = META_DATA_URL_TEMPLATE % video_id
        metadata_text = self._download_webpage(metadata_url, video_id)
        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))

        def optional_text(tag):
            # Text of the first child named `tag`, or None when it is absent.
            element = metadata.find(tag)
            return None if element is None else element.text

        # Mandatory fields.
        url_flv_el = metadata.find('url_flv')
        if url_flv_el is None:
            raise ExtractorError(u'Unable to extract download url')
        video_url = url_flv_el.text
        extension = os.path.splitext(video_url)[1][1:]

        title_el = metadata.find('title')
        if title_el is None:
            raise ExtractorError(u'Unable to extract title')
        title = title_el.text

        # Optional fields.
        format_id_el = metadata.find('format_id')
        # Fall back to mp4 when the metadata names no format.
        video_format = 'mp4' if format_id_el is None else format_id_el.text

        return [{
            'id': video_id,
            'url': video_url,
            'title': title,
            'ext': extension,
            'format': video_format,
            'thumbnail': optional_text('imagePreview'),
            'description': optional_text('description'),
        }]
class SpiegelIE(InfoExtractor):
    """Information Extractor for spiegel.de videos"""
    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for the video at *url*."""
        video_id = re.match(self._VALID_URL, url).group('videoID')

        # The title is only available on the HTML page.
        page = self._download_webpage(url, video_id)
        video_title = self._html_search_regex(
            r'<div class="module-title">(.*?)</div>', page, u'title')

        # File name and duration come from a per-video XML document.
        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
        xml_code = self._download_webpage(
            xml_url, video_id,
            note=u'Downloading XML', errnote=u'Failed to download XML')
        root = xml.etree.ElementTree.fromstring(xml_code)

        # Only the last child of the document root is used.
        last_type = root[-1]
        filename = last_type.findall('./filename')[0].text
        duration = float(last_type.findall('./duration')[0].text)

        return [{
            'id': video_id,
            'url': 'http://video2.spiegel.de/flash/' + filename,
            'ext': filename.rpartition('.')[2],
            'title': video_title,
            'duration': duration,
        }]
class LiveLeakIE(InfoExtractor):
    """Information Extractor for liveleak.com"""

    # The scheme is optional; both http and https are accepted.
    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
    IE_NAME = u'liveleak'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for the video at *url*.

        Raises ExtractorError when *url* does not match _VALID_URL.
        Description and uploader are optional (searched with fatal=False).
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = mobj.group('video_id')
        webpage = self._download_webpage(url, video_id)

        video_url = self._search_regex(
            r'file: "(.*?)",', webpage, u'video URL')

        # The og:title carries a site prefix that is stripped off.
        video_title = self._html_search_regex(
            r'<meta property="og:title" content="(?P<title>.*?)"',
            webpage, u'title').replace('LiveLeak.com -', '').strip()

        video_description = self._html_search_regex(
            r'<meta property="og:description" content="(?P<desc>.*?)"',
            webpage, u'description', fatal=False)

        video_uploader = self._html_search_regex(
            r'By:.*?(\w+)</a>', webpage, u'uploader', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'description': video_description,
            'uploader': video_uploader,
        }]
class TumblrIE(InfoExtractor):
    """Information Extractor for tumblr.com video posts"""
    _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for the post at *url*.

        Raises ExtractorError when no video source is found in the page.
        """
        url_match = re.match(self._VALID_URL, url)
        video_id = url_match.group('id')
        blog = url_match.group('blog_name')

        # Always fetch the canonical /post/ page, whatever form was given.
        url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
        webpage = self._download_webpage(url, video_id)

        re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
        video = re.search(re_video, webpage)
        if video is None:
            raise ExtractorError(u'Unable to extract video')

        # The first poster is used as thumbnail; its backslashes are dropped.
        video_thumbnail = self._search_regex(
            r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
            webpage, u'thumbnail', fatal=False)
        if video_thumbnail:
            video_thumbnail = video_thumbnail.replace('\\', '')

        # <title> is the only place a title can be found; it is incomplete,
        # but searching elsewhere does not work for all videos.
        video_title = self._html_search_regex(
            r'<title>(?P<title>.*?)</title>',
            webpage, u'title', flags=re.DOTALL)

        info = {
            'id': video_id,
            'url': video.group('video_url'),
            'title': video_title,
            'thumbnail': video_thumbnail,
            'ext': video.group('ext'),
        }
        return [info]
class BandcampIE(InfoExtractor):
    """Information Extractor for free bandcamp.com tracks"""
    _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for the track at *url*.

        Raises ExtractorError when the page offers no free download.
        """
        title = re.match(self._VALID_URL, url).group('title')
        webpage = self._download_webpage(url, title)

        # Locate the link to the free download page.
        m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
        if m_download is None:
            raise ExtractorError(u'No free songs found')
        download_link = m_download.group(1)

        track_id = re.search(
            r'var TralbumData = {(.*?)id: (?P<id>\d*?)$',
            webpage, re.MULTILINE|re.DOTALL).group('id')

        download_webpage = self._download_webpage(
            download_link, track_id, 'Downloading free downloads page')

        # The track dictionary is embedded in some javascript code.
        items_json = re.search(
            r'items: (.*?),$', download_webpage, re.MULTILINE).group(1)
        info = json.loads(items_json)[0]

        # mp3-320 is picked for now, until format selection can be easily
        # implemented.
        mp3_info = info[u'downloads'][u'mp3-320']

        # Using this url directly reports that the link has expired.
        initial_url = mp3_info[u'url']
        re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
        m_url = re.match(re_url, initial_url)

        # Build the url used to request the final track url. Bandcamp builds
        # it in the script download_bunde_*.js
        request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (
            m_url.group('server'), m_url.group('fsig'), track_id, m_url.group('ts'))
        final_url_webpage = self._download_webpage(
            request_url, track_id, 'Requesting download url')

        # With a correctly generated .rand field the url would be in the
        # "download_url" key; "retry_url" is used instead.
        final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)

        return [{
            'id': track_id,
            'title': info[u'title'],
            'ext': 'mp3',
            'url': final_url,
            'thumbnail': info[u'thumb_url'],
            'uploader': info[u'artist'],
        }]
class RedTubeIE(InfoExtractor):
    """Information Extractor for redtube"""
    _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for the video at *url*.

        Raises ExtractorError when *url* does not match _VALID_URL.
        """
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        video_id = url_match.group('id')
        webpage = self._download_webpage(url, video_id)

        self.report_extraction(video_id)

        video_url = self._html_search_regex(
            r'<source src="(.+?)" type="video/mp4">',
            webpage, u'video URL')
        video_title = self._html_search_regex(
            '<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
            webpage, u'title')

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
        }
        return [info]
class InaIE(InfoExtractor):
    """Information Extractor for Ina.fr"""
    _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for the video at *url*.

        Both the media URL and the title are read from the video's MRSS
        notice rather than from the HTML page.
        """
        video_id = re.match(self._VALID_URL, url).group('id')
        notice_url = 'http://player.ina.fr/notices/%s.mrss' % video_id
        notice = self._download_webpage(notice_url, video_id)

        self.report_extraction(video_id)

        video_url = self._html_search_regex(
            r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
            notice, u'video URL')
        video_title = self._search_regex(
            r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
            notice, u'title')

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
        }
        return [info]
class HowcastIE(InfoExtractor):
    """Information Extractor for Howcast.com"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P<id>\d+)'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for the video at *url*.

        Description and thumbnail are optional (searched with fatal=False).
        """
        video_id = re.match(self._VALID_URL, url).group('id')

        # Fetch the canonical page for the id, not the URL as given.
        page = self._download_webpage(
            'http://www.howcast.com/videos/' + video_id, video_id)

        self.report_extraction(video_id)

        video_url = self._search_regex(
            r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
            page, u'video URL')
        video_title = self._html_search_regex(
            r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
            page, u'title')
        video_description = self._html_search_regex(
            r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
            page, u'description', fatal=False)
        thumbnail = self._html_search_regex(
            r'<meta content=\'(.+?)\' property=\'og:image\'',
            page, u'thumbnail', fatal=False)

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'description': video_description,
            'thumbnail': thumbnail,
        }
        return [info]
class VineIE(InfoExtractor):
    """Information Extractor for Vine.co"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for the video at *url*.

        Thumbnail and uploader are optional (searched with fatal=False).
        """
        video_id = re.match(self._VALID_URL, url).group('id')

        # Fetch the canonical https page for the id.
        page = self._download_webpage('https://vine.co/v/' + video_id, video_id)

        self.report_extraction(video_id)

        video_url = self._html_search_regex(
            r'<meta property="twitter:player:stream" content="(.+?)"',
            page, u'video URL')
        video_title = self._html_search_regex(
            r'<meta property="og:title" content="(.+?)"',
            page, u'title')
        # The optional second group keeps any query string out of the capture.
        thumbnail = self._html_search_regex(
            r'<meta property="og:image" content="(.+?)(\?.*?)?"',
            page, u'thumbnail', fatal=False)
        uploader = self._html_search_regex(
            r'<div class="user">.*?<h2>(.+?)</h2>',
            page, u'uploader', fatal=False, flags=re.DOTALL)

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'thumbnail': thumbnail,
            'uploader': uploader,
        }
        return [info]
class FlickrIE(InfoExtractor):
    """Information Extractor for Flickr videos"""
    _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P<uploader_id>[\w\-_@]+)/(?P<id>\d+).*'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for the video at *url*.

        The stream location is resolved in three requests: the photo page
        (for the secret), a first XML document (for the node id) and a
        playlist XML document (for the stream itself).

        Raises ExtractorError when the playlist names no stream.
        """
        url_match = re.match(self._VALID_URL, url)
        video_id = url_match.group('id')
        video_uploader_id = url_match.group('uploader_id')

        page_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
        page = self._download_webpage(page_url, video_id)

        secret = self._search_regex(r"photo_secret: '(\w+)'", page, u'secret')

        first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
        first_xml = self._download_webpage(
            first_url, video_id, 'Downloading first data webpage')

        node_id = self._html_search_regex(
            r'<Item id="id">(\d+-\d+)</Item>', first_xml, u'node_id')

        second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
        second_xml = self._download_webpage(
            second_url, video_id, 'Downloading second data webpage')

        self.report_extraction(video_id)

        stream = re.search(r'<STREAM APP="(.+?)" FULLPATH="(.+?)"', second_xml)
        if stream is None:
            raise ExtractorError(u'Unable to extract video url')
        # Only the path part is HTML-unescaped before joining.
        video_url = stream.group(1) + unescapeHTML(stream.group(2))

        video_title = self._html_search_regex(
            r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
            page, u'video title')
        video_description = self._html_search_regex(
            r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
            page, u'description', fatal=False)
        thumbnail = self._html_search_regex(
            r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
            page, u'thumbnail', fatal=False)

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'description': video_description,
            'thumbnail': thumbnail,
            'uploader_id': video_uploader_id,
        }
        return [info]
class TeamcocoIE(InfoExtractor):
    """Information Extractor for teamcoco.com"""
    _VALID_URL = r'http://teamcoco\.com/video/(?P<url_title>.*)'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for the video at *url*.

        Raises ExtractorError when *url* does not match _VALID_URL.
        Thumbnail and description are optional (searched with fatal=False).
        """
        url_match = re.match(self._VALID_URL, url)
        if url_match is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        # The URL only carries a title slug; the numeric id is in the page.
        url_title = url_match.group('url_title')
        page = self._download_webpage(url, url_title)

        video_id = self._html_search_regex(
            r'<article class="video" data-id="(\d+?)"',
            page, u'video id')

        self.report_extraction(video_id)

        video_title = self._html_search_regex(
            r'<meta property="og:title" content="(.+?)"',
            page, u'title')
        thumbnail = self._html_search_regex(
            r'<meta property="og:image" content="(.+?)"',
            page, u'thumbnail', fatal=False)
        video_description = self._html_search_regex(
            r'<meta property="og:description" content="(.*?)"',
            page, u'description', fatal=False)

        # The media URL is read from a separate XML document.
        data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
        data = self._download_webpage(data_url, video_id, 'Downloading data webpage')
        video_url = self._html_search_regex(
            r'<file type="high".*?>(.*?)</file>',
            data, u'video URL')

        info = {
            'id': video_id,
            'url': video_url,
            'ext': 'mp4',
            'title': video_title,
            'thumbnail': thumbnail,
            'description': video_description,
        }
        return [info]
class XHamsterIE(InfoExtractor):
    """Information Extractor for xHamster"""
    _VALID_URL = r'(?:http://)?(?:www\.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for the video at *url*.

        Raises ExtractorError when the media location is not found in the
        page. A missing upload date only produces a warning; a missing
        uploader falls back to u'anonymous'; the thumbnail is optional.
        """
        video_id = re.match(self._VALID_URL, url).group('id')
        page_url = 'http://xhamster.com/movies/%s/.html' % video_id
        webpage = self._download_webpage(page_url, video_id)

        media = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
        if media is None:
            raise ExtractorError(u'Unable to extract media URL')
        if not media.group('server'):
            # No server given: 'file' is itself a URL-quoted address.
            video_url = compat_urllib_parse.unquote(media.group('file'))
        else:
            video_url = media.group('server') + '/key=' + media.group('file')
        video_extension = video_url.split('.')[-1]

        video_title = self._html_search_regex(
            r'<title>(?P<title>.+?) - xHamster\.com</title>',
            webpage, u'title')

        # No description is extracted: it is not visible anywhere in the UI.

        date_match = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
        if date_match:
            # Concatenate to YYYYMMDD.
            video_upload_date = (date_match.group('upload_date_Y')
                                 + date_match.group('upload_date_m')
                                 + date_match.group('upload_date_d'))
        else:
            video_upload_date = None
            self._downloader.report_warning(u'Unable to extract upload date')

        video_uploader_id = self._html_search_regex(
            r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
            webpage, u'uploader id', default=u'anonymous')

        video_thumbnail = self._search_regex(
            r'\'image\':\'(?P<thumbnail>[^\']+)\'',
            webpage, u'thumbnail', fatal=False)

        return [{
            'id': video_id,
            'url': video_url,
            'ext': video_extension,
            'title': video_title,
            'upload_date': video_upload_date,
            'uploader_id': video_uploader_id,
            'thumbnail': video_thumbnail
        }]
class HypemIE(InfoExtractor):
    """Information Extractor for hypem"""
    _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for the track at *url*.

        Raises ExtractorError when *url* does not match _VALID_URL or when
        either JSON payload cannot be parsed.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        track_id = mobj.group(1)

        # Request the page with the extra 'ax' and 'ts' query parameters and
        # keep the cookie it sets; the cookie is sent with the second request.
        query = compat_urllib_parse.urlencode({'ax': 1, 'ts': time.time()})
        request = compat_urllib_request.Request(url + "?" + query)
        response, urlh = self._download_webpage_handle(
            request, track_id, u'Downloading webpage with the url')
        cookie = urlh.headers.get('Set-Cookie', '')

        self.report_extraction(track_id)

        html_tracks = self._html_search_regex(
            r'<script type="application/json" id="displayList-data">(.*?)</script>',
            response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
        try:
            track_list = json.loads(html_tracks)
            track = track_list[u'tracks'][0]
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')

        key = track[u"key"]
        track_id = track[u"id"]
        artist = track[u"artist"]
        title = track[u"song"]

        serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
        # The empty body makes this a POST. It must be bytes: Python 3's
        # urllib rejects str request data.
        request = compat_urllib_request.Request(serve_url, b"", {'Content-Type': 'application/json'})
        request.add_header('cookie', cookie)
        song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
        try:
            song_data = json.loads(song_data_json)
        except ValueError:
            raise ExtractorError(u'Hypemachine contained invalid JSON.')
        final_url = song_data[u"url"]

        return [{
            'id': track_id,
            'url': final_url,
            'ext': "mp3",
            'title': title,
            'artist': artist,
        }]
class Vbox7IE(InfoExtractor):
    """Information Extractor for Vbox7"""
    _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'

    def _real_extract(self, url):
        """Return a one-element list with the info dict for the video at *url*.

        Raises ExtractorError when *url* does not match _VALID_URL or when
        the info response does not hold exactly a media and a thumbnail URL.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group(1)

        # The first page only redirects via javascript; follow it by hand.
        redirect_page, urlh = self._download_webpage_handle(url, video_id)
        new_location = self._search_regex(
            r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
        redirect_url = urlh.geturl() + new_location
        webpage = self._download_webpage(
            redirect_url, video_id, u'Downloading redirect page')

        # Only the part of <title> before the first '/' is kept.
        title = self._html_search_regex(
            r'<title>(.*)</title>', webpage, u'title').split('/')[0].strip()

        ext = "flv"
        info_url = "http://vbox7.com/play/magare.do"
        # POST data must be bytes: Python 3's urllib rejects str bodies.
        data = compat_urllib_parse.urlencode({'as3': '1', 'vid': video_id}).encode('ascii')
        info_request = compat_urllib_request.Request(info_url, data)
        info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
        info_response = self._download_webpage(
            info_request, video_id, u'Downloading info webpage')
        if info_response is None:
            raise ExtractorError(u'Unable to extract the media url')

        # The response is two '&'-separated key=value pairs. Split each on
        # the first '=' only, so values that contain '=' stay whole.
        try:
            final_url, thumbnail_url = [
                pair.split('=', 1)[1] for pair in info_response.split('&')]
        except (IndexError, ValueError):
            raise ExtractorError(u'Unable to extract the media url')

        return [{
            'id': video_id,
            'url': final_url,
            'ext': ext,
            'title': title,
            'thumbnail': thumbnail_url,
        }]
646 def gen_extractors():
647 """ Return a list of an instance of every supported extractor.
648 The order does matter; the first extractor matched is the one handling the URL.
676 StanfordOpenClassroomIE(),
def get_info_extractor(ie_name):
    """Look up an info extractor class in this module by its short name.

    'IE' is appended to *ie_name* and the result is fetched from the module
    globals; an unknown name raises KeyError.
    """
    class_name = ie_name + 'IE'
    return globals()[class_name]