Merge pull request #853 from mc2avr/master
[youtube-dl] / youtube_dl / InfoExtractors.py
index 5811ef0da634358ff41cd9b70492522796a34db4..24a77a1abba247c49234189d794d7da6cc83486f 100755 (executable)
@@ -1099,7 +1099,7 @@ class VimeoIE(InfoExtractor):
 
         # Extract uploader and uploader_id
         video_uploader = config["video"]["owner"]["name"]
-        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1]
+        video_uploader_id = config["video"]["owner"]["url"].split('/')[-1] if config["video"]["owner"]["url"] else None
 
         # Extract video thumbnail
         video_thumbnail = config["video"]["thumbnail"]
@@ -3980,7 +3980,7 @@ class SpiegelIE(InfoExtractor):
         video_id = m.group('videoID')
 
         webpage = self._download_webpage(url, video_id)
-        m = re.search(r'<div class="spVideoTitle">(.*?)</div>', webpage)
+        m = re.search(r'<div class="module-title">(.*?)</div>', webpage)
         if not m:
             raise ExtractorError(u'Cannot find title')
         video_title = unescapeHTML(m.group(1))
@@ -4091,6 +4091,64 @@ class ARDIE(InfoExtractor):
             info["url"] = stream["video_url"]
         return [info]
 
+class ZDFIE(InfoExtractor):
+    _VALID_URL = r'^http://www\.zdf\.de\/ZDFmediathek\/(.*beitrag\/video\/)(?P<video_id>[^/\?]+)(?:\?.*)?'
+    _TITLE = r'<h1(?: class="beitragHeadline")?>(?P<title>.*)</h1>'
+    _MEDIA_STREAM = r'<a href="(?P<video_url>.+(?P<media_type>.streaming).+/zdf/(?P<quality>[^\/]+)/[^"]*)".+class="play".+>'
+    _MMS_STREAM = r'href="(?P<video_url>mms://[^"]*)"'
+    _RTSP_STREAM = r'(?P<video_url>rtsp://[^"]*.mp4)'
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        if mobj is None:
+            raise ExtractorError(u'Invalid URL: %s' % url)
+        video_id = mobj.group('video_id')
+
+        html = self._download_webpage(url, video_id)
+        streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
+        if streams is None:
+            raise ExtractorError(u'No media url found.')
+
+        # s['media_type'] == 'wstreaming' -> use 'Windows Media Player' and mms url
+        # s['media_type'] == 'hstreaming' -> use 'Quicktime' and rtsp url
+        # choose first/default media type and highest quality for now
+        for s in streams:        #find 300 - dsl1000mbit
+            if s['quality'] == '300' and s['media_type'] == 'wstreaming':
+                stream_=s
+                break
+        for s in streams:        #find veryhigh - dsl2000mbit
+            if s['quality'] == 'veryhigh' and s['media_type'] == 'wstreaming': # 'hstreaming' - rtsp is not working
+                stream_=s
+                break
+        if stream_ is None:
+            raise ExtractorError(u'No stream found.')
+
+        media_link = self._download_webpage(stream_['video_url'], video_id,'Get stream URL')
+
+        self.report_extraction(video_id)
+        mobj = re.search(self._TITLE, html)
+        if mobj is None:
+            raise ExtractorError(u'Cannot extract title')
+        title = unescapeHTML(mobj.group('title'))
+
+        mobj = re.search(self._MMS_STREAM, media_link)
+        if mobj is None:
+            mobj = re.search(self._RTSP_STREAM, media_link)
+            if mobj is None:
+                raise ExtractorError(u'Cannot extract mms:// or rtsp:// URL')
+        mms_url = mobj.group('video_url')
+
+        mobj = re.search('(.*)[.](?P<ext>[^.]+)', mms_url)
+        if mobj is None:
+            raise ExtractorError(u'Cannot extract extention')
+        ext = mobj.group('ext')
+
+        return [{'id': video_id,
+                 'url': mms_url,
+                 'title': title,
+                 'ext': ext
+                 }]
+
 class TumblrIE(InfoExtractor):
     _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'
 
@@ -4483,6 +4541,92 @@ class XHamsterIE(InfoExtractor):
             'thumbnail': video_thumbnail
         }]
 
+class HypemIE(InfoExtractor):
+    """Information Extractor for hypem"""
+    _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)'
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        if mobj is None:
+            raise ExtractorError(u'Invalid URL: %s' % url)
+        track_id = mobj.group(1)
+
+        data = { 'ax': 1, 'ts': time.time() }
+        data_encoded = compat_urllib_parse.urlencode(data)
+        complete_url = url + "?" + data_encoded
+        request = compat_urllib_request.Request(complete_url)
+        response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url')
+        cookie = urlh.headers.get('Set-Cookie', '')
+
+        self.report_extraction(track_id)
+        mobj = re.search(r'<script type="application/json" id="displayList-data">(.*?)</script>', response, flags=re.MULTILINE|re.DOTALL)
+        if mobj is None:
+            raise ExtractorError(u'Unable to extrack tracks')
+        html_tracks = mobj.group(1).strip()
+        try:
+            track_list = json.loads(html_tracks)
+            track = track_list[u'tracks'][0]
+        except ValueError:
+            raise ExtractorError(u'Hypemachine contained invalid JSON.')
+
+        key = track[u"key"]
+        track_id = track[u"id"]
+        artist = track[u"artist"]
+        title = track[u"song"]
+
+        serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key))
+        request = compat_urllib_request.Request(serve_url, "" , {'Content-Type': 'application/json'})
+        request.add_header('cookie', cookie)
+        song_data_json = self._download_webpage(request, track_id, u'Downloading metadata')
+        try:
+            song_data = json.loads(song_data_json)
+        except ValueError:
+            raise ExtractorError(u'Hypemachine contained invalid JSON.')
+        final_url = song_data[u"url"]
+
+        return [{
+            'id':       track_id,
+            'url':      final_url,
+            'ext':      "mp3",
+            'title':    title,
+            'artist':   artist,
+        }]
+
+class Vbox7IE(InfoExtractor):
+    """Information Extractor for Vbox7"""
+    _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'
+
+    def _real_extract(self,url):
+        mobj = re.match(self._VALID_URL, url)
+        if mobj is None:
+            raise ExtractorError(u'Invalid URL: %s' % url)
+        video_id = mobj.group(1)
+
+        redirect_page, urlh = self._download_webpage_handle(url, video_id)
+        redirect_url = urlh.geturl() + re.search(r'window\.location = \'(.*)\';', redirect_page).group(1)
+        webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')
+
+        title = re.search(r'<title>(.*)</title>', webpage)
+        title = (title.group(1)).split('/')[0].strip()
+
+        ext = "flv"
+        info_url = "http://vbox7.com/play/magare.do"
+        data = compat_urllib_parse.urlencode({'as3':'1','vid':video_id})
+        info_request = compat_urllib_request.Request(info_url, data)
+        info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
+        info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
+        if info_response is None:
+            raise ExtractorError(u'Unable to extract the media url')
+        (final_url, thumbnail_url) = map(lambda x: x.split('=')[1], info_response.split('&'))
+
+        return [{
+            'id':        video_id,
+            'url':       final_url,
+            'ext':       ext,
+            'title':     title,
+            'thumbnail': thumbnail_url,
+        }]
+
 def gen_extractors():
     """ Return a list of an instance of every supported extractor.
     The order does matter; the first extractor matched is the one handling the URL.
@@ -4536,6 +4680,7 @@ def gen_extractors():
         SpiegelIE(),
         LiveLeakIE(),
         ARDIE(),
+        ZDFIE(),
         TumblrIE(),
         BandcampIE(),
         RedTubeIE(),
@@ -4545,6 +4690,8 @@ def gen_extractors():
         FlickrIE(),
         TeamcocoIE(),
         XHamsterIE(),
+        HypemIE(),
+        Vbox7IE(),
         GenericIE()
     ]