Merge pull request #730 by @JohnyMoSwag
[youtube-dl] / youtube_dl / InfoExtractors.py
index 8e164760b5d4c5ec6661a1dcd638263faea96c50..b4c86cfa311b8391c40bf6e5f0f179724fae7057 100755 (executable)
@@ -2305,7 +2305,7 @@ class MyVideoIE(InfoExtractor):
         webpage = self._download_webpage(webpage_url, video_id)
 
         self.report_extraction(video_id)
-        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\' />',
+        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\'',
                  webpage)
         if mobj is None:
             self._downloader.report_error(u'unable to extract media URL')
@@ -3604,10 +3604,10 @@ class FunnyOrDieIE(InfoExtractor):
             self._downloader.report_error(u'unable to find video information')
         video_url = unescapeHTML(m.group('url'))
 
-        m = re.search(r"class='player_page_h1'>\s+<a.*?>(?P<title>.*?)</a>", webpage)
+        m = re.search(r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", webpage, flags=re.DOTALL)
         if not m:
             self._downloader.trouble(u'Cannot find video title')
-        title = unescapeHTML(m.group('title'))
+        title = clean_html(m.group('title'))
 
         m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
         if m:
@@ -3687,6 +3687,62 @@ class UstreamIE(InfoExtractor):
                   }
         return [info]
 
+class WorldStarHipHopIE(InfoExtractor):
+    _VALID_URL = r'http://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
+    IE_NAME = u'WorldStarHipHop'
+
+    def _real_extract(self, url):
+        _src_url = r"""(http://hw-videos.*(?:mp4|flv))"""
+
+        webpage_src = compat_urllib_request.urlopen(url).read()
+        webpage_src = webpage_src.decode('utf-8')
+
+        mobj = re.search(_src_url, webpage_src)
+
+        m = re.match(self._VALID_URL, url)
+        video_id = m.group('id')
+
+        if mobj is not None:
+            video_url = mobj.group()
+            if 'mp4' in video_url:
+                ext = 'mp4'
+            else:
+                ext = 'flv'
+        else:
+            self._downloader.trouble(u'ERROR: Cannot find video url for %s' % video_id)
+            return
+
+        _title = r"""<title>(.*)</title>"""
+
+        mobj = re.search(_title, webpage_src)
+        
+        if mobj is not None:
+            title = mobj.group(1)
+        else:
+            title = 'World Start Hip Hop - %s' % time.ctime()
+
+        _thumbnail = r"""rel="image_src" href="(.*)" />"""
+        mobj = re.search(_thumbnail, webpage_src)
+
+        # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
+        if mobj is not None:
+            thumbnail = mobj.group(1)
+        else:
+            _title = r"""candytitles.*>(.*)</span>"""
+            mobj = re.search(_title, webpage_src)
+            if mobj is not None:
+                title = mobj.group(1)
+            thumbnail = None
+        
+        results = [{
+                    'id': video_id,
+                    'url' : video_url,
+                    'title' : title,
+                    'thumbnail' : thumbnail,
+                    'ext' : ext,
+                    }]
+        return results
+
 class RBMARadioIE(InfoExtractor):
     _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'
 
@@ -4051,7 +4107,7 @@ class TEDIE(InfoExtractor):
         videoName=m.group('name')
         webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
         # If the url includes the language we get the title translated
-        title_RE=r'<h1><span id="altHeadline" >(?P<title>.*)</span></h1>'
+        title_RE=r'<span id="altHeadline" >(?P<title>.*)</span>'
         title=re.search(title_RE, webpage).group('title')
         info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
                         "id":(?P<videoID>[\d]+).*?
@@ -4128,7 +4184,7 @@ class MySpassIE(InfoExtractor):
         return [info]
 
 class SpiegelIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)$'
+    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'
 
     def _real_extract(self, url):
         m = re.match(self._VALID_URL, url)
@@ -4160,6 +4216,55 @@ class SpiegelIE(InfoExtractor):
         }
         return [info]
 
+class LiveLeakIE(InfoExtractor):
+
+    _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
+    IE_NAME = u'liveleak'
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        if mobj is None:
+            self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
+            return
+
+        video_id = mobj.group('video_id')
+
+        webpage = self._download_webpage(url, video_id)
+
+        m = re.search(r'file: "(.*?)",', webpage)
+        if not m:
+            self._downloader.report_error(u'unable to find video url')
+            return
+        video_url = m.group(1)
+
+        m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
+        if not m:
+            self._downloader.trouble(u'Cannot find video title')
+        title = unescapeHTML(m.group('title')).replace('LiveLeak.com -', '').strip()
+
+        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
+        if m:
+            desc = unescapeHTML(m.group('desc'))
+        else:
+            desc = None
+
+        m = re.search(r'By:.*?(\w+)</a>', webpage)
+        if m:
+            uploader = clean_html(m.group(1))
+        else:
+            uploader = None
+
+        info = {
+            'id':  video_id,
+            'url': video_url,
+            'ext': 'mp4',
+            'title': title,
+            'description': desc,
+            'uploader': uploader
+        }
+
+        return [info]
+
 
 def gen_extractors():
     """ Return a list of an instance of every supported extractor.
@@ -4200,6 +4305,7 @@ def gen_extractors():
         GooglePlusIE(),
         ArteTvIE(),
         NBAIE(),
+        WorldStarHipHopIE(),
         JustinTVIE(),
         FunnyOrDieIE(),
         SteamIE(),
@@ -4210,7 +4316,6 @@ def gen_extractors():
         TEDIE(),
         MySpassIE(),
         SpiegelIE(),
+        LiveLeakIE(),
         GenericIE()
     ]
-
-