[redtube] move into own file
[youtube-dl] / youtube_dl / InfoExtractors.py
index 4002c9485d394b75d89be4f5a444d9f40c421a4d..c06ecbe52aad148ce036cd98c74ef1f371b27c78 100755 (executable)
@@ -20,6 +20,7 @@ from .extractor.common import InfoExtractor, SearchInfoExtractor
 
 from .extractor.ard import ARDIE
 from .extractor.arte import ArteTvIE
+from .extractor.bandcamp import BandcampIE
 from .extractor.bliptv import BlipTVIE, BlipTVUserIE
 from .extractor.comedycentral import ComedyCentralIE
 from .extractor.collegehumor import CollegeHumorIE
@@ -35,21 +36,29 @@ from .extractor.googleplus import GooglePlusIE
 from .extractor.googlesearch import GoogleSearchIE
 from .extractor.infoq import InfoQIE
 from .extractor.justintv import JustinTVIE
+from .extractor.keek import KeekIE
+from .extractor.liveleak import LiveLeakIE
 from .extractor.metacafe import MetacafeIE
 from .extractor.mixcloud import MixcloudIE
 from .extractor.mtv import MTVIE
+from .extractor.myspass import MySpassIE
 from .extractor.myvideo import MyVideoIE
 from .extractor.nba import NBAIE
 from .extractor.statigram import StatigramIE
 from .extractor.photobucket import PhotobucketIE
 from .extractor.pornotube import PornotubeIE
 from .extractor.rbmaradio import RBMARadioIE
+from .extractor.redtube import RedTubeIE
 from .extractor.soundcloud import SoundcloudIE, SoundcloudSetIE
+from .extractor.spiegel import SpiegelIE
 from .extractor.stanfordoc import StanfordOpenClassroomIE
 from .extractor.steam import SteamIE
 from .extractor.ted import TEDIE
+from .extractor.tumblr import TumblrIE
 from .extractor.ustream import UstreamIE
+from .extractor.vbox7 import Vbox7IE
 from .extractor.vimeo import VimeoIE
+from .extractor.vine import VineIE
 from .extractor.worldstarhiphop import WorldStarHipHopIE
 from .extractor.xnxx import XNXXIE
 from .extractor.xvideos import XVideosIE
@@ -85,268 +94,15 @@ from .extractor.zdf import ZDFIE
 
 
 
-class KeekIE(InfoExtractor):
-    _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
-    IE_NAME = u'keek'
 
-    def _real_extract(self, url):
-        m = re.match(self._VALID_URL, url)
-        video_id = m.group('videoID')
-
-        video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
-        thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
-        webpage = self._download_webpage(url, video_id)
-
-        video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
-            webpage, u'title')
-
-        uploader = self._html_search_regex(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
-            webpage, u'uploader', fatal=False)
-
-        info = {
-                'id': video_id,
-                'url': video_url,
-                'ext': 'mp4',
-                'title': video_title,
-                'thumbnail': thumbnail,
-                'uploader': uploader
-        }
-        return [info]
 
 
-class MySpassIE(InfoExtractor):
-    _VALID_URL = r'http://www.myspass.de/.*'
 
-    def _real_extract(self, url):
-        META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s'
-
-        # video id is the last path element of the URL
-        # usually there is a trailing slash, so also try the second but last
-        url_path = compat_urllib_parse_urlparse(url).path
-        url_parent_path, video_id = os.path.split(url_path)
-        if not video_id:
-            _, video_id = os.path.split(url_parent_path)
-
-        # get metadata
-        metadata_url = META_DATA_URL_TEMPLATE % video_id
-        metadata_text = self._download_webpage(metadata_url, video_id)
-        metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8'))
-
-        # extract values from metadata
-        url_flv_el = metadata.find('url_flv')
-        if url_flv_el is None:
-            raise ExtractorError(u'Unable to extract download url')
-        video_url = url_flv_el.text
-        extension = os.path.splitext(video_url)[1][1:]
-        title_el = metadata.find('title')
-        if title_el is None:
-            raise ExtractorError(u'Unable to extract title')
-        title = title_el.text
-        format_id_el = metadata.find('format_id')
-        if format_id_el is None:
-            format = ext
-        else:
-            format = format_id_el.text
-        description_el = metadata.find('description')
-        if description_el is not None:
-            description = description_el.text
-        else:
-            description = None
-        imagePreview_el = metadata.find('imagePreview')
-        if imagePreview_el is not None:
-            thumbnail = imagePreview_el.text
-        else:
-            thumbnail = None
-        info = {
-            'id': video_id,
-            'url': video_url,
-            'title': title,
-            'ext': extension,
-            'format': format,
-            'thumbnail': thumbnail,
-            'description': description
-        }
-        return [info]
 
-class SpiegelIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'
 
-    def _real_extract(self, url):
-        m = re.match(self._VALID_URL, url)
-        video_id = m.group('videoID')
 
-        webpage = self._download_webpage(url, video_id)
 
-        video_title = self._html_search_regex(r'<div class="module-title">(.*?)</div>',
-            webpage, u'title')
-
-        xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
-        xml_code = self._download_webpage(xml_url, video_id,
-                    note=u'Downloading XML', errnote=u'Failed to download XML')
-
-        idoc = xml.etree.ElementTree.fromstring(xml_code)
-        last_type = idoc[-1]
-        filename = last_type.findall('./filename')[0].text
-        duration = float(last_type.findall('./duration')[0].text)
-
-        video_url = 'http://video2.spiegel.de/flash/' + filename
-        video_ext = filename.rpartition('.')[2]
-        info = {
-            'id': video_id,
-            'url': video_url,
-            'ext': video_ext,
-            'title': video_title,
-            'duration': duration,
-        }
-        return [info]
-
-class LiveLeakIE(InfoExtractor):
-
-    _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
-    IE_NAME = u'liveleak'
-
-    def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        if mobj is None:
-            raise ExtractorError(u'Invalid URL: %s' % url)
 
-        video_id = mobj.group('video_id')
-
-        webpage = self._download_webpage(url, video_id)
-
-        video_url = self._search_regex(r'file: "(.*?)",',
-            webpage, u'video URL')
-
-        video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
-            webpage, u'title').replace('LiveLeak.com -', '').strip()
-
-        video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
-            webpage, u'description', fatal=False)
-
-        video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
-            webpage, u'uploader', fatal=False)
-
-        info = {
-            'id':  video_id,
-            'url': video_url,
-            'ext': 'mp4',
-            'title': video_title,
-            'description': video_description,
-            'uploader': video_uploader
-        }
-
-        return [info]
-
-
-
-class TumblrIE(InfoExtractor):
-    _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'
-
-    def _real_extract(self, url):
-        m_url = re.match(self._VALID_URL, url)
-        video_id = m_url.group('id')
-        blog = m_url.group('blog_name')
-
-        url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id)
-        webpage = self._download_webpage(url, video_id)
-
-        re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
-        video = re.search(re_video, webpage)
-        if video is None:
-           raise ExtractorError(u'Unable to extract video')
-        video_url = video.group('video_url')
-        ext = video.group('ext')
-
-        video_thumbnail = self._search_regex(r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
-            webpage, u'thumbnail', fatal=False)  # We pick the first poster
-        if video_thumbnail: video_thumbnail = video_thumbnail.replace('\\', '')
-
-        # The only place where you can get a title, it's not complete,
-        # but searching in other places doesn't work for all videos
-        video_title = self._html_search_regex(r'<title>(?P<title>.*?)</title>',
-            webpage, u'title', flags=re.DOTALL)
-
-        return [{'id': video_id,
-                 'url': video_url,
-                 'title': video_title,
-                 'thumbnail': video_thumbnail,
-                 'ext': ext
-                 }]
-
-class BandcampIE(InfoExtractor):
-    _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P<title>.*)'
-
-    def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        title = mobj.group('title')
-        webpage = self._download_webpage(url, title)
-        # We get the link to the free download page
-        m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
-        if m_download is None:
-            raise ExtractorError(u'No free songs found')
-
-        download_link = m_download.group(1)
-        id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$', 
-                       webpage, re.MULTILINE|re.DOTALL).group('id')
-
-        download_webpage = self._download_webpage(download_link, id,
-                                                  'Downloading free downloads page')
-        # We get the dictionary of the track from some javascrip code
-        info = re.search(r'items: (.*?),$',
-                         download_webpage, re.MULTILINE).group(1)
-        info = json.loads(info)[0]
-        # We pick mp3-320 for now, until format selection can be easily implemented.
-        mp3_info = info[u'downloads'][u'mp3-320']
-        # If we try to use this url it says the link has expired
-        initial_url = mp3_info[u'url']
-        re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$'
-        m_url = re.match(re_url, initial_url)
-        #We build the url we will use to get the final track url
-        # This url is build in Bandcamp in the script download_bunde_*.js
-        request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), id, m_url.group('ts'))
-        final_url_webpage = self._download_webpage(request_url, id, 'Requesting download url')
-        # If we could correctly generate the .rand field the url would be
-        #in the "download_url" key
-        final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1)
-
-        track_info = {'id':id,
-                      'title' : info[u'title'],
-                      'ext' :   'mp3',
-                      'url' :   final_url,
-                      'thumbnail' : info[u'thumb_url'],
-                      'uploader' :  info[u'artist']
-                      }
-
-        return [track_info]
-
-class RedTubeIE(InfoExtractor):
-    """Information Extractor for redtube"""
-    _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)'
-
-    def _real_extract(self,url):
-        mobj = re.match(self._VALID_URL, url)
-        if mobj is None:
-            raise ExtractorError(u'Invalid URL: %s' % url)
-
-        video_id = mobj.group('id')
-        video_extension = 'mp4'        
-        webpage = self._download_webpage(url, video_id)
-
-        self.report_extraction(video_id)
-
-        video_url = self._html_search_regex(r'<source src="(.+?)" type="video/mp4">',
-            webpage, u'video URL')
-
-        video_title = self._html_search_regex('<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
-            webpage, u'title')
-
-        return [{
-            'id':       video_id,
-            'url':      video_url,
-            'ext':      video_extension,
-            'title':    video_title,
-        }]
-        
 class InaIE(InfoExtractor):
     """Information Extractor for Ina.fr"""
     _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'
@@ -408,39 +164,6 @@ class HowcastIE(InfoExtractor):
             'thumbnail': thumbnail,
         }]
 
-class VineIE(InfoExtractor):
-    """Information Extractor for Vine.co"""
-    _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'
-
-    def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-
-        video_id = mobj.group('id')
-        webpage_url = 'https://vine.co/v/' + video_id
-        webpage = self._download_webpage(webpage_url, video_id)
-
-        self.report_extraction(video_id)
-
-        video_url = self._html_search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
-            webpage, u'video URL')
-
-        video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
-            webpage, u'title')
-
-        thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"',
-            webpage, u'thumbnail', fatal=False)
-
-        uploader = self._html_search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
-            webpage, u'uploader', fatal=False, flags=re.DOTALL)
-
-        return [{
-            'id':        video_id,
-            'url':       video_url,
-            'ext':       'mp4',
-            'title':     video_title,
-            'thumbnail': thumbnail,
-            'uploader':  uploader,
-        }]
 
 class FlickrIE(InfoExtractor):
     """Information Extractor for Flickr videos"""
@@ -632,41 +355,6 @@ class HypemIE(InfoExtractor):
             'artist':   artist,
         }]
 
-class Vbox7IE(InfoExtractor):
-    """Information Extractor for Vbox7"""
-    _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'
-
-    def _real_extract(self,url):
-        mobj = re.match(self._VALID_URL, url)
-        if mobj is None:
-            raise ExtractorError(u'Invalid URL: %s' % url)
-        video_id = mobj.group(1)
-
-        redirect_page, urlh = self._download_webpage_handle(url, video_id)
-        new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
-        redirect_url = urlh.geturl() + new_location
-        webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')
-
-        title = self._html_search_regex(r'<title>(.*)</title>',
-            webpage, u'title').split('/')[0].strip()
-
-        ext = "flv"
-        info_url = "http://vbox7.com/play/magare.do"
-        data = compat_urllib_parse.urlencode({'as3':'1','vid':video_id})
-        info_request = compat_urllib_request.Request(info_url, data)
-        info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
-        info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
-        if info_response is None:
-            raise ExtractorError(u'Unable to extract the media url')
-        (final_url, thumbnail_url) = map(lambda x: x.split('=')[1], info_response.split('&'))
-
-        return [{
-            'id':        video_id,
-            'url':       final_url,
-            'ext':       ext,
-            'title':     title,
-            'thumbnail': thumbnail_url,
-        }]
 
 
 def gen_extractors():