Merge remote-tracking branch 'sagittarian/vimeo-no-desc'
[youtube-dl] / youtube_dl / InfoExtractors.py
index 8e164760b5d4c5ec6661a1dcd638263faea96c50..0807306609bcde085046b68b8486b5b8f40a1d11 100755 (executable)
@@ -253,11 +253,11 @@ class YoutubeIE(InfoExtractor):
         try:
             sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8')
         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
-            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
+            return (u'unable to download video subtitles: %s' % compat_str(err), None)
         sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
         sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list)
         if not sub_lang_list:
-            return (u'WARNING: video doesn\'t have subtitles', None)
+            return (u'video doesn\'t have subtitles', None)
         return sub_lang_list
 
     def _list_available_subtitles(self, video_id):
@@ -265,6 +265,10 @@ class YoutubeIE(InfoExtractor):
         self.report_video_subtitles_available(video_id, sub_lang_list)
 
     def _request_subtitle(self, sub_lang, sub_name, video_id, format):
+        """
+        Return tuple:
+        (error_message, sub_lang, sub)
+        """
         self.report_video_subtitles_request(video_id, sub_lang, format)
         params = compat_urllib_parse.urlencode({
             'lang': sub_lang,
@@ -276,14 +280,20 @@ class YoutubeIE(InfoExtractor):
         try:
             sub = compat_urllib_request.urlopen(url).read().decode('utf-8')
         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
-            return (u'WARNING: unable to download video subtitles: %s' % compat_str(err), None)
+            return (u'unable to download video subtitles: %s' % compat_str(err), None, None)
         if not sub:
-            return (u'WARNING: Did not fetch video subtitles', None)
+            return (u'Did not fetch video subtitles', None, None)
         return (None, sub_lang, sub)
 
     def _extract_subtitle(self, video_id):
+        """
+        Return a list with a tuple:
+        [(error_message, sub_lang, sub)]
+        """
         sub_lang_list = self._get_available_subtitles(video_id)
         sub_format = self._downloader.params.get('subtitlesformat')
+        if  isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
+            return [(sub_lang_list[0], None, None)]
         if self._downloader.params.get('subtitleslang', False):
             sub_lang = self._downloader.params.get('subtitleslang')
         elif 'en' in sub_lang_list:
@@ -291,7 +301,7 @@ class YoutubeIE(InfoExtractor):
         else:
             sub_lang = list(sub_lang_list.keys())[0]
         if not sub_lang in sub_lang_list:
-            return (u'WARNING: no closed captions found in the specified language "%s"' % sub_lang, None)
+            return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)]
 
         subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
         return [subtitle]
@@ -299,6 +309,8 @@ class YoutubeIE(InfoExtractor):
     def _extract_all_subtitles(self, video_id):
         sub_lang_list = self._get_available_subtitles(video_id)
         sub_format = self._downloader.params.get('subtitlesformat')
+        if  isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles
+            return [(sub_lang_list[0], None, None)]
         subtitles = []
         for sub_lang in sub_lang_list:
             subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
@@ -532,14 +544,14 @@ class YoutubeIE(InfoExtractor):
             if video_subtitles:
                 (sub_error, sub_lang, sub) = video_subtitles[0]
                 if sub_error:
-                    self._downloader.trouble(sub_error)
+                    self._downloader.report_error(sub_error)
 
         if self._downloader.params.get('allsubtitles', False):
             video_subtitles = self._extract_all_subtitles(video_id)
             for video_subtitle in video_subtitles:
                 (sub_error, sub_lang, sub) = video_subtitle
                 if sub_error:
-                    self._downloader.trouble(sub_error)
+                    self._downloader.report_error(sub_error)
 
         if self._downloader.params.get('listsubtitles', False):
             sub_lang_list = self._list_available_subtitles(video_id)
@@ -1118,7 +1130,7 @@ class VimeoIE(InfoExtractor):
         # Extract video description
         video_description = get_element_by_attribute("itemprop", "description", webpage)
         if video_description: video_description = clean_html(video_description)
-        else: video_description = ''
+        else: video_description = u''
 
         # Extract upload date
         video_upload_date = None
@@ -1710,9 +1722,7 @@ class YoutubePlaylistIE(InfoExtractor):
                         (?:
                            (?:course|view_play_list|my_playlists|artist|playlist|watch)
                            \? (?:.*?&)*? (?:p|a|list)=
-                        |  user/.*?/user/
                         |  p/
-                        |  user/.*?#[pg]/c/
                         )
                         ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
                         .*
@@ -2305,7 +2315,7 @@ class MyVideoIE(InfoExtractor):
         webpage = self._download_webpage(webpage_url, video_id)
 
         self.report_extraction(video_id)
-        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\' />',
+        mobj = re.search(r'<link rel=\'image_src\' href=\'(http://is[0-9].myvideo\.de/de/movie[0-9]+/[a-f0-9]+)/thumbs/.*?\.jpg\'',
                  webpage)
         if mobj is None:
             self._downloader.report_error(u'unable to extract media URL')
@@ -2802,6 +2812,87 @@ class SoundcloudIE(InfoExtractor):
             'description': info['description'],
         }]
 
+class SoundcloudSetIE(InfoExtractor):
+    """Information extractor for soundcloud.com sets
+
+    The set URL is resolved through the api.soundcloud.com resolve.json
+    endpoint, whose response lists the tracks of the set. For each track
+    the stream definitions are requested from api.sndcdn.com and the
+    'http_mp3_128_url' entry is used as the media URL.
+    """
+
+    _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)'
+    IE_NAME = u'soundcloud'
+
+    def __init__(self, downloader=None):
+        InfoExtractor.__init__(self, downloader)
+
+    def report_resolve(self, video_id):
+        """Report that the set URL is being resolved."""
+        self._downloader.to_screen(u'[%s] %s: Resolving id' % (self.IE_NAME, video_id))
+
+    def report_extraction(self, video_id):
+        """Report that the stream of a track is being retrieved."""
+        self._downloader.to_screen(u'[%s] %s: Retrieving stream' % (self.IE_NAME, video_id))
+
+    def _real_extract(self, url):
+        """Return a list with one info dict per track of the set (None after a reported error)."""
+        mobj = re.match(self._VALID_URL, url)
+        if mobj is None:
+            self._downloader.report_error(u'invalid URL: %s' % url)
+            return
+
+        # uploader and set slug are both taken from the URL
+        uploader = mobj.group(1)
+        slug_title = mobj.group(2)
+        set_path = '%s/sets/%s' % (uploader, slug_title)
+
+        self.report_resolve(set_path)
+
+        url = 'http://soundcloud.com/%s' % set_path
+        resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
+        request = compat_urllib_request.Request(resolv_url)
+        try:
+            info_json_bytes = compat_urllib_request.urlopen(request).read()
+            info_json = info_json_bytes.decode('utf-8')
+        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
+            self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err))
+            return
+
+        videos = []
+        info = json.loads(info_json)
+        if 'errors' in info:
+            for err in info['errors']:
+                self._downloader.report_error(u'unable to download video webpage: %s' % compat_str(err['error_message']))
+            return
+
+        for track in info['tracks']:
+            video_id = track['id']
+            self.report_extraction(set_path)
+
+            streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
+            request = compat_urllib_request.Request(streams_url)
+            try:
+                stream_json_bytes = compat_urllib_request.urlopen(request).read()
+                stream_json = stream_json_bytes.decode('utf-8')
+            except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
+                self._downloader.report_error(u'unable to download stream definitions: %s' % compat_str(err))
+                return
+
+            streams = json.loads(stream_json)
+            mediaURL = streams['http_mp3_128_url']
+
+            videos.append({
+                'id':       video_id,
+                'url':      mediaURL,
+                'uploader': track['user']['username'],
+                'upload_date':  track['created_at'],
+                'title':    track['title'],
+                'ext':      u'mp3',
+                'description': track['description'],
+            })
+        return videos
+
 
 class InfoQIE(InfoExtractor):
     """Information extractor for infoq.com"""
@@ -3604,10 +3695,10 @@ class FunnyOrDieIE(InfoExtractor):
             self._downloader.report_error(u'unable to find video information')
         video_url = unescapeHTML(m.group('url'))
 
-        m = re.search(r"class='player_page_h1'>\s+<a.*?>(?P<title>.*?)</a>", webpage)
+        m = re.search(r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", webpage, flags=re.DOTALL)
         if not m:
             self._downloader.trouble(u'Cannot find video title')
-        title = unescapeHTML(m.group('title'))
+        title = clean_html(m.group('title'))
 
         m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
         if m:
@@ -3687,6 +3778,62 @@ class UstreamIE(InfoExtractor):
                   }
         return [info]
 
+class WorldStarHipHopIE(InfoExtractor):
+    """Information extractor for worldstarhiphop.com and worldstarcandy.com videos"""
+
+    _VALID_URL = r'http://(?:www|m)\.worldstar(?:candy|hiphop)\.com/videos/video\.php\?v=(?P<id>.*)'
+    IE_NAME = u'WorldStarHipHop'
+
+    def _real_extract(self, url):
+        """Return a one-element list with the info dict of the video (None after a reported error)."""
+        _src_url = r"""(http://hw-videos.*(?:mp4|flv))"""
+
+        m = re.match(self._VALID_URL, url)
+        video_id = m.group('id')
+
+        # Fetch the page through _download_webpage, as the other extractors do, not a bare urlopen
+        webpage_src = self._download_webpage(url, video_id)
+
+        mobj = re.search(_src_url, webpage_src)
+        if mobj is None:
+            self._downloader.report_error(u'Cannot find video url for %s' % video_id)
+            return
+
+        video_url = mobj.group()
+        # The extension is guessed from the media URL; anything that is not mp4 is treated as flv
+        ext = 'mp4' if 'mp4' in video_url else 'flv'
+
+        _title = r"""<title>(.*)</title>"""
+
+        mobj = re.search(_title, webpage_src)
+
+        if mobj is not None:
+            title = mobj.group(1)
+        else:
+            title = 'World Star Hip Hop - %s' % time.ctime()
+
+        _thumbnail = r"""rel="image_src" href="(.*)" />"""
+        mobj = re.search(_thumbnail, webpage_src)
+
+        # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
+        if mobj is not None:
+            thumbnail = mobj.group(1)
+        else:
+            _title = r"""candytitles.*>(.*)</span>"""
+            mobj = re.search(_title, webpage_src)
+            if mobj is not None:
+                title = mobj.group(1)
+            thumbnail = None
+
+        results = [{
+                    'id': video_id,
+                    'url' : video_url,
+                    'title' : title,
+                    'thumbnail' : thumbnail,
+                    'ext' : ext,
+                    }]
+        return results
+
 class RBMARadioIE(InfoExtractor):
     _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$'
 
@@ -4051,7 +4198,7 @@ class TEDIE(InfoExtractor):
         videoName=m.group('name')
         webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName)
         # If the url includes the language we get the title translated
-        title_RE=r'<h1><span id="altHeadline" >(?P<title>.*)</span></h1>'
+        title_RE=r'<span id="altHeadline" >(?P<title>.*)</span>'
         title=re.search(title_RE, webpage).group('title')
         info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?)
                         "id":(?P<videoID>[\d]+).*?
@@ -4128,7 +4275,7 @@ class MySpassIE(InfoExtractor):
         return [info]
 
 class SpiegelIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)$'
+    _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$'
 
     def _real_extract(self, url):
         m = re.match(self._VALID_URL, url)
@@ -4160,6 +4307,95 @@ class SpiegelIE(InfoExtractor):
         }
         return [info]
 
+class LiveLeakIE(InfoExtractor):
+    """Information extractor for liveleak.com"""
+
+    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)'
+    IE_NAME = u'liveleak'
+
+    def _real_extract(self, url):
+        """Return a one-element list with the info dict of the video (None after a reported error)."""
+        mobj = re.match(self._VALID_URL, url)
+        if mobj is None:
+            self._downloader.report_error(u'invalid URL: %s' % url)
+            return
+
+        video_id = mobj.group('video_id')
+
+        webpage = self._download_webpage(url, video_id)
+
+        m = re.search(r'file: "(.*?)",', webpage)
+        if not m:
+            self._downloader.report_error(u'unable to find video url')
+            return
+        video_url = m.group(1)
+
+        m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
+        # Without a match, m.group() below would raise AttributeError, so stop here
+        if not m:
+            self._downloader.report_error(u'Cannot find video title')
+            return
+        title = unescapeHTML(m.group('title')).replace('LiveLeak.com -', '').strip()
+
+        # The description is optional
+        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
+        desc = unescapeHTML(m.group('desc')) if m else None
+
+        # The uploader is optional as well
+        m = re.search(r'By:.*?(\w+)</a>', webpage)
+        uploader = clean_html(m.group(1)) if m else None
+
+        info = {
+            'id':  video_id,
+            'url': video_url,
+            'ext': 'mp4',
+            'title': title,
+            'description': desc,
+            'uploader': uploader
+        }
+
+        return [info]
+
+class ARDIE(InfoExtractor):
+    """Information extractor for ardmediathek.de and mediathek.daserste.de"""
+    _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?'
+    _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>'
+    _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)'
+
+    def _real_extract(self, url):
+        """Return a one-element list with the info dict of the video (None after a reported error)."""
+        # determine video id from url; a numeric documentId parameter takes precedence
+        m = re.match(self._VALID_URL, url)
+        numid = re.search(r'documentId=([0-9]+)', url)
+        video_id = numid.group(1) if numid else m.group('video_id')
+
+        # determine title and media streams from webpage
+        html = self._download_webpage(url, video_id)
+        title = re.search(self._TITLE, html)
+        streams = [s.groupdict() for s in re.finditer(self._MEDIA_STREAM, html)]
+        if not streams and '"fsk"' in html:
+            self._downloader.report_error(u'this video is only available after 8:00 pm')
+            return
+
+        # choose default media type and highest quality for now
+        streams = [s for s in streams if int(s["media_type"]) == 0]
+        if title is None or not streams:
+            self._downloader.report_error(u'unable to extract title or media streams')
+            return
+        stream = max(streams, key=lambda s: int(s["quality"]))
+
+        # there's two possibilities: RTMP stream ('mp4:' play path) or HTTP download ('.mp4' file)
+        rtmp_url, path = stream['rtmp_url'], stream['video_url']
+        if not (path.startswith('mp4:') if rtmp_url else path.endswith('.mp4')):
+            self._downloader.report_error(u'unsupported media format: %s' % path)
+            return
+
+        info = {'id': video_id, 'title': title.group('title'), 'ext': 'mp4', 'url': rtmp_url or path}
+        if rtmp_url:
+            self._downloader.to_screen(u'[%s] RTMP download detected' % self.IE_NAME)
+            info["play_path"] = path
+        return [info]
+
 
 def gen_extractors():
     """ Return a list of an instance of every supported extractor.
@@ -4187,6 +4423,7 @@ def gen_extractors():
         EscapistIE(),
         CollegeHumorIE(),
         XVideosIE(),
+        SoundcloudSetIE(),
         SoundcloudIE(),
         InfoQIE(),
         MixcloudIE(),
@@ -4200,6 +4437,7 @@ def gen_extractors():
         GooglePlusIE(),
         ArteTvIE(),
         NBAIE(),
+        WorldStarHipHopIE(),
         JustinTVIE(),
         FunnyOrDieIE(),
         SteamIE(),
@@ -4210,7 +4448,7 @@ def gen_extractors():
         TEDIE(),
         MySpassIE(),
         SpiegelIE(),
+        LiveLeakIE(),
+        ARDIE(),
         GenericIE()
     ]
-
-