Merge pull request #853 from mc2avr/master

[youtube-dl] / youtube_dl / InfoExtractors.py
diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py

index b40edf5fbb92f6cf86fc7f96b3d99346b61523a6..24a77a1abba247c49234189d794d7da6cc83486f 100755 (executable)
--- a/youtube_dl/InfoExtractors.py
+++ b/youtube_dl/InfoExtractors.py
@@ -4091,6 +4091,64 @@ class ARDIE(InfoExtractor):
              info["url"] = stream["video_url"]
          return [info]
  
+class ZDFIE(InfoExtractor):
+    _VALID_URL = r'^http://www\.zdf\.de\/ZDFmediathek\/(.*beitrag\/video\/)(?P<video_id>[^/\?]+)(?:\?.*)?'
+    _TITLE = r'<h1(?: class="beitragHeadline")?>(?P<title>.*)</h1>'
+    _MEDIA_STREAM = r'<a href="(?P<video_url>.+(?P<media_type>.streaming).+/zdf/(?P<quality>[^\/]+)/[^"]*)".+class="play".+>'
+    _MMS_STREAM = r'href="(?P<video_url>mms://[^"]*)"'
+    _RTSP_STREAM = r'(?P<video_url>rtsp://[^"]*.mp4)'
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        if mobj is None:
+            raise ExtractorError(u'Invalid URL: %s' % url)
+        video_id = mobj.group('video_id')
+
+        html = self._download_webpage(url, video_id)
+        streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
+        if streams is None:
+            raise ExtractorError(u'No media url found.')
+
+        # s['media_type'] == 'wstreaming' -> use 'Windows Media Player' and mms url
+        # s['media_type'] == 'hstreaming' -> use 'Quicktime' and rtsp url
+        # choose first/default media type and highest quality for now
+        for s in streams:        #find 300 - dsl1000mbit
+            if s['quality'] == '300' and s['media_type'] == 'wstreaming':
+                stream_=s
+                break
+        for s in streams:        #find veryhigh - dsl2000mbit
+            if s['quality'] == 'veryhigh' and s['media_type'] == 'wstreaming': # 'hstreaming' - rtsp is not working
+                stream_=s
+                break
+        if stream_ is None:
+            raise ExtractorError(u'No stream found.')
+
+        media_link = self._download_webpage(stream_['video_url'], video_id,'Get stream URL')
+
+        self.report_extraction(video_id)
+        mobj = re.search(self._TITLE, html)
+        if mobj is None:
+            raise ExtractorError(u'Cannot extract title')
+        title = unescapeHTML(mobj.group('title'))
+
+        mobj = re.search(self._MMS_STREAM, media_link)
+        if mobj is None:
+            mobj = re.search(self._RTSP_STREAM, media_link)
+            if mobj is None:
+                raise ExtractorError(u'Cannot extract mms:// or rtsp:// URL')
+        mms_url = mobj.group('video_url')
+
+        mobj = re.search('(.*)[.](?P<ext>[^.]+)', mms_url)
+        if mobj is None:
+            raise ExtractorError(u'Cannot extract extention')
+        ext = mobj.group('ext')
+
+        return [{'id': video_id,
+                 'url': mms_url,
+                 'title': title,
+                 'ext': ext
+                 }]
+
  class TumblrIE(InfoExtractor):
      _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'
  
@@ -4534,6 +4592,40 @@ class HypemIE(InfoExtractor):
              'artist':   artist,
          }]
  
+class Vbox7IE(InfoExtractor):
+    """Information Extractor for Vbox7"""
+    _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'
+
+    def _real_extract(self,url):
+        mobj = re.match(self._VALID_URL, url)
+        if mobj is None:
+            raise ExtractorError(u'Invalid URL: %s' % url)
+        video_id = mobj.group(1)
+
+        redirect_page, urlh = self._download_webpage_handle(url, video_id)
+        redirect_url = urlh.geturl() + re.search(r'window\.location = \'(.*)\';', redirect_page).group(1)
+        webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')
+
+        title = re.search(r'<title>(.*)</title>', webpage)
+        title = (title.group(1)).split('/')[0].strip()
+
+        ext = "flv"
+        info_url = "http://vbox7.com/play/magare.do"
+        data = compat_urllib_parse.urlencode({'as3':'1','vid':video_id})
+        info_request = compat_urllib_request.Request(info_url, data)
+        info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
+        info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
+        if info_response is None:
+            raise ExtractorError(u'Unable to extract the media url')
+        (final_url, thumbnail_url) = map(lambda x: x.split('=')[1], info_response.split('&'))
+
+        return [{
+            'id':        video_id,
+            'url':       final_url,
+            'ext':       ext,
+            'title':     title,
+            'thumbnail': thumbnail_url,
+        }]
  
  def gen_extractors():
      """ Return a list of an instance of every supported extractor.
@@ -4588,6 +4680,7 @@ def gen_extractors():
          SpiegelIE(),
          LiveLeakIE(),
          ARDIE(),
+        ZDFIE(),
          TumblrIE(),
          BandcampIE(),
          RedTubeIE(),
@@ -4598,6 +4691,7 @@ def gen_extractors():
          TeamcocoIE(),
          XHamsterIE(),
          HypemIE(),
+        Vbox7IE(),
          GenericIE()
      ]