Merge pull request #887 from anisse/master

author Jaime Marquínez Ferrándiz <jaime.marquinez.ferrandiz@gmail.com>

Wed, 19 Jun 2013 10:51:26 +0000 (12:51 +0200)

committer Jaime Marquínez Ferrándiz <jaime.marquinez.ferrandiz@gmail.com>

Wed, 19 Jun 2013 10:52:44 +0000 (12:52 +0200)
author Jaime Marquínez Ferrándiz <jaime.marquinez.ferrandiz@gmail.com>
Wed, 19 Jun 2013 10:51:26 +0000 (12:51 +0200)
committer Jaime Marquínez Ferrándiz <jaime.marquinez.ferrandiz@gmail.com>
Wed, 19 Jun 2013 10:52:44 +0000 (12:52 +0200)
diff --combined test/test_youtube_lists.py

index b842e6cc160b2a1cc06f5e6c71ba103fd9a849bc,78657b51ca43b1776596e612266cb0e8d9a07ebd..e8b49ff8ebe3bd3f74f796888cdc7f27b1ecc386
--- 1/test/test_youtube_lists.py
--- 2/test/test_youtube_lists.py
+++ b/test/test_youtube_lists.py
@@@ -53,7 -53,8 +53,7 @@@ class TestYoutubeLists(unittest.TestCas
           dl = FakeDownloader()
           ie = YoutubePlaylistIE(dl)
           result = ie.extract('PLBB231211A4F62143')[0]
- -        self.assertEqual(result['title'], 'Team Fortress 2')
- -        self.assertTrue(len(result['entries']) > 40)
+ +        self.assertTrue(len(result['entries']) > 25)
   
       def test_youtube_playlist_long(self):
           dl = FakeDownloader()
@@@ -104,5 -105,5 +104,11 @@@
           result = ie.extract('https://www.youtube.com/user/TheLinuxFoundation')[0]
           self.assertTrue(len(result['entries']) >= 320)
   
++    def test_youtube_safe_search(self):
++        dl = FakeDownloader()
++        ie = YoutubePlaylistIE(dl)
++        result = ie.extract('PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl')[0]
++        self.assertEqual(len(result['entries']), 2)
++
   if __name__ == '__main__':
       unittest.main()
diff --combined youtube_dl/InfoExtractors.py

index f36503d218c6761bf1a243007a8605c2a4bb37c3,39278a2e97e6471fe8f8ea72f4331884969da09c..db089403ffa7b2750ceeee6438af079c250f3680
--- 1/youtube_dl/InfoExtractors.py
--- 2/youtube_dl/InfoExtractors.py
+++ b/youtube_dl/InfoExtractors.py
@@@ -191,47 -191,6 +191,47 @@@ class InfoExtractor(object)
               video_info['title'] = playlist_title
           return video_info
   
+ +    def _search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
+ +        """
+ +        Perform a regex search on the given string, using a single or a list of
+ +        patterns returning the first matching group.
+ +        In case of failure return a default value or raise a WARNING or a
+ +        ExtractorError, depending on fatal, specifying the field name.
+ +        """
+ +        if isinstance(pattern, (str, compat_str, compiled_regex_type)):
+ +            mobj = re.search(pattern, string, flags)
+ +        else:
+ +            for p in pattern:
+ +                mobj = re.search(p, string, flags)
+ +                if mobj: break
+ +
+ +        if sys.stderr.isatty() and os.name != 'nt':
+ +            _name = u'\033[0;34m%s\033[0m' % name
+ +        else:
+ +            _name = name
+ +
+ +        if mobj:
+ +            # return the first matching group
+ +            return next(g for g in mobj.groups() if g is not None)
+ +        elif default is not None:
+ +            return default
+ +        elif fatal:
+ +            raise ExtractorError(u'Unable to extract %s' % _name)
+ +        else:
+ +            self._downloader.report_warning(u'unable to extract %s; '
+ +                u'please report this issue on GitHub.' % _name)
+ +            return None
+ +
+ +    def _html_search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
+ +        """
+ +        Like _search_regex, but strips HTML tags and unescapes entities.
+ +        """
+ +        res = self._search_regex(pattern, string, name, default, fatal, flags)
+ +        if res:
+ +            return clean_html(res).strip()
+ +        else:
+ +            return res
+ +
   class SearchInfoExtractor(InfoExtractor):
       """
       Base class for paged search queries extractors.
@@@ -1005,13 -964,18 +1005,13 @@@ class PhotobucketIE(InfoExtractor)
               }]
   
           # We try looking in other parts of the webpage
- -        mobj = re.search(r'<link rel="video_src" href=".*\?file=([^"]+)" />', webpage)
- -        if mobj is None:
- -            raise ExtractorError(u'Unable to extract media URL')
- -        mediaURL = compat_urllib_parse.unquote(mobj.group(1))
- -
- -        video_url = mediaURL
+ +        video_url = self._search_regex(r'<link rel="video_src" href=".*\?file=([^"]+)" />',
+ +            webpage, u'video URL')
   
           mobj = re.search(r'<title>(.*) video by (.*) - Photobucket</title>', webpage)
           if mobj is None:
               raise ExtractorError(u'Unable to extract title')
           video_title = mobj.group(1).decode('utf-8')
- -
           video_uploader = mobj.group(2).decode('utf-8')
   
           return [{
@@@ -1433,12 -1397,16 +1433,12 @@@ class GenericIE(InfoExtractor)
           #   Site Name | Video Title
           #   Video Title - Tagline | Site Name
           # and so on and so forth; it's just not practical
- -        mobj = re.search(r'<title>(.*)</title>', webpage)
- -        if mobj is None:
- -            raise ExtractorError(u'Unable to extract title')
- -        video_title = mobj.group(1)
+ +        video_title = self._html_search_regex(r'<title>(.*)</title>',
+ +            webpage, u'video title')
   
           # video uploader is domain name
- -        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
- -        if mobj is None:
- -            raise ExtractorError(u'Unable to extract title')
- -        video_uploader = mobj.group(1)
+ +        video_uploader = self._search_regex(r'(?:https?://)?([^/]*)/.*',
+ +            url, u'video uploader')
   
           return [{
               'id':       video_id,
@@@ -1576,7 -1544,7 +1576,7 @@@ class YoutubePlaylistIE(InfoExtractor)
                        |
                           ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
                        )"""
-     _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json'
+     _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json&safeSearch=none'
       _MAX_RESULTS = 50
       IE_NAME = u'youtube:playlist'
   
@@@ -1837,7 -1805,10 +1837,7 @@@ class DepositFilesIE(InfoExtractor)
           file_extension = os.path.splitext(file_url)[1][1:]
   
           # Search for file title
- -        mobj = re.search(r'<b title="(.*?)">', webpage)
- -        if mobj is None:
- -            raise ExtractorError(u'Unable to extract title')
- -        file_title = mobj.group(1).decode('utf-8')
+ +        file_title = self._search_regex(r'<b title="(.*?)">', webpage, u'title')
   
           return [{
               'id':       file_id.decode('utf-8'),
@@@ -1931,8 -1902,10 +1931,8 @@@ class FacebookIE(InfoExtractor)
           video_duration = int(video_data['video_duration'])
           thumbnail = video_data['thumbnail_src']
   
- -        m = re.search('<h2 class="uiHeaderTitle">([^<]+)</h2>', webpage)
- -        if not m:
- -            raise ExtractorError(u'Cannot find title in webpage')
- -        video_title = unescapeHTML(m.group(1))
+ +        video_title = self._html_search_regex('<h2 class="uiHeaderTitle">([^<]+)</h2>',
+ +            webpage, u'title')
   
           info = {
               'id': video_id,
@@@ -2094,10 -2067,15 +2094,10 @@@ class MyVideoIE(InfoExtractor)
               self.report_extraction(video_id)
               video_url = mobj.group(1) + '.flv'
   
- -            mobj = re.search('<title>([^<]+)</title>', webpage)
- -            if mobj is None:
- -                raise ExtractorError(u'Unable to extract title')
- -            video_title = mobj.group(1)
+ +            video_title = self._html_search_regex('<title>([^<]+)</title>',
+ +                webpage, u'title')
   
- -            mobj = re.search('[.](.+?)$', video_url)
- -            if mobj is None:
- -                raise ExtractorError(u'Unable to extract extention')
- -            video_ext = mobj.group(1)
+ +            video_ext = self._search_regex('[.](.+?)$', video_url, u'extension')
   
               return [{
                   'id':       video_id,
@@@ -2145,23 -2123,25 +2145,23 @@@
           # extracting infos
           self.report_extraction(video_id)
   
+ +        video_url = None
           mobj = re.search('connectionurl=\'(.*?)\'', dec_data)
- -        if mobj is None:
- -            raise ExtractorError(u'unable to extract rtmpurl')
- -        video_rtmpurl = compat_urllib_parse.unquote(mobj.group(1))
- -        if 'myvideo2flash' in video_rtmpurl:
- -            self._downloader.report_warning(u'forcing RTMPT ...')
- -            video_rtmpurl = video_rtmpurl.replace('rtmpe://', 'rtmpt://')
- -
- -        # extract non rtmp videos
- -        if (video_rtmpurl is None) or (video_rtmpurl == ''):
+ +        if mobj:
+ +            video_url = compat_urllib_parse.unquote(mobj.group(1))
+ +            if 'myvideo2flash' in video_url:
+ +                self._downloader.report_warning(u'forcing RTMPT ...')
+ +                video_url = video_url.replace('rtmpe://', 'rtmpt://')
+ +
+ +        if not video_url:
+ +            # extract non rtmp videos
               mobj = re.search('path=\'(http.*?)\' source=\'(.*?)\'', dec_data)
               if mobj is None:
                   raise ExtractorError(u'unable to extract url')
- -            video_rtmpurl = compat_urllib_parse.unquote(mobj.group(1)) + compat_urllib_parse.unquote(mobj.group(2))
+ +            video_url = compat_urllib_parse.unquote(mobj.group(1)) + compat_urllib_parse.unquote(mobj.group(2))
   
- -        mobj = re.search('source=\'(.*?)\'', dec_data)
- -        if mobj is None:
- -            raise ExtractorError(u'unable to extract swfobj')
- -        video_file     = compat_urllib_parse.unquote(mobj.group(1))
+ +        video_file = self._search_regex('source=\'(.*?)\'', dec_data, u'video file')
+ +        video_file = compat_urllib_parse.unquote(video_file)
   
           if not video_file.endswith('f4m'):
               ppath, prefix = video_file.split('.')
@@@ -2173,16 -2153,20 +2173,16 @@@
                   video_filepath + video_file
               ).replace('.f4m', '.m3u8')
   
- -        mobj = re.search('swfobject.embedSWF\(\'(.+?)\'', webpage)
- -        if mobj is None:
- -            raise ExtractorError(u'unable to extract swfobj')
- -        video_swfobj = compat_urllib_parse.unquote(mobj.group(1))
+ +        video_swfobj = self._search_regex('swfobject.embedSWF\(\'(.+?)\'', webpage, u'swfobj')
+ +        video_swfobj = compat_urllib_parse.unquote(video_swfobj)
   
- -        mobj = re.search("<h1(?: class='globalHd')?>(.*?)</h1>", webpage)
- -        if mobj is None:
- -            raise ExtractorError(u'unable to extract title')
- -        video_title = mobj.group(1)
+ +        video_title = self._html_search_regex("<h1(?: class='globalHd')?>(.*?)</h1>",
+ +            webpage, u'title')
   
           return [{
               'id':                 video_id,
- -            'url':                video_rtmpurl,
- -            'tc_url':             video_rtmpurl,
+ +            'url':                video_url,
+ +            'tc_url':             video_url,
               'uploader':           None,
               'upload_date':        None,
               'title':              video_title,
@@@ -2193,7 -2177,6 +2193,7 @@@
               'player_url':         video_swfobj,
           }]
   
+ +
   class ComedyCentralIE(InfoExtractor):
       """Information extractor for The Daily Show and Colbert Report """
   
@@@ -2375,25 -2358,19 +2375,25 @@@ class EscapistIE(InfoExtractor)
           showName = mobj.group('showname')
           videoId = mobj.group('episode')
   
- -        self.report_extraction(showName)
- -        webPage = self._download_webpage(url, showName)
+ +        self.report_extraction(videoId)
+ +        webpage = self._download_webpage(url, videoId)
+ +
+ +        videoDesc = self._html_search_regex('<meta name="description" content="([^"]*)"',
+ +            webpage, u'description', fatal=False)
+ +
+ +        imgUrl = self._html_search_regex('<meta property="og:image" content="([^"]*)"',
+ +            webpage, u'thumbnail', fatal=False)
+ +
+ +        playerUrl = self._html_search_regex('<meta property="og:video" content="([^"]*)"',
+ +            webpage, u'player url')
   
- -        descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
- -        description = unescapeHTML(descMatch.group(1))
- -        imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
- -        imgUrl = unescapeHTML(imgMatch.group(1))
- -        playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
- -        playerUrl = unescapeHTML(playerUrlMatch.group(1))
- -        configUrlMatch = re.search('config=(.*)$', playerUrl)
- -        configUrl = compat_urllib_parse.unquote(configUrlMatch.group(1))
+ +        title = self._html_search_regex('<meta name="title" content="([^"]*)"',
+ +            webpage, u'player url').split(' : ')[-1]
   
- -        configJSON = self._download_webpage(configUrl, showName,
+ +        configUrl = self._search_regex('config=(.*)$', playerUrl, u'config url')
+ +        configUrl = compat_urllib_parse.unquote(configUrl)
+ +
+ +        configJSON = self._download_webpage(configUrl, videoId,
                                               u'Downloading configuration',
                                               u'unable to download configuration')
   
@@@ -2413,10 -2390,10 +2413,10 @@@
               'url': videoUrl,
               'uploader': showName,
               'upload_date': None,
- -            'title': showName,
+ +            'title': title,
               'ext': 'mp4',
               'thumbnail': imgUrl,
- -            'description': description,
+ +            'description': videoDesc,
               'player_url': playerUrl,
           }
   
@@@ -2501,17 -2478,26 +2501,17 @@@ class XVideosIE(InfoExtractor)
   
           self.report_extraction(video_id)
   
- -
           # Extract video URL
- -        mobj = re.search(r'flv_url=(.+?)&', webpage)
- -        if mobj is None:
- -            raise ExtractorError(u'Unable to extract video url')
- -        video_url = compat_urllib_parse.unquote(mobj.group(1))
- -
+ +        video_url = compat_urllib_parse.unquote(self._search_regex(r'flv_url=(.+?)&',
+ +            webpage, u'video URL'))
   
           # Extract title
- -        mobj = re.search(r'<title>(.*?)\s+-\s+XVID', webpage)
- -        if mobj is None:
- -            raise ExtractorError(u'Unable to extract video title')
- -        video_title = mobj.group(1)
- -
+ +        video_title = self._html_search_regex(r'<title>(.*?)\s+-\s+XVID',
+ +            webpage, u'title')
   
           # Extract video thumbnail
- -        mobj = re.search(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)', webpage)
- -        if mobj is None:
- -            raise ExtractorError(u'Unable to extract video thumbnail')
- -        video_thumbnail = mobj.group(0)
+ +        video_thumbnail = self._search_regex(r'http://(?:img.*?\.)xvideos.com/videos/thumbs/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/[a-fA-F0-9]+/([a-fA-F0-9.]+jpg)',
+ +            webpage, u'thumbnail', fatal=False)
   
           info = {
               'id': video_id,
@@@ -2668,12 -2654,16 +2668,12 @@@ class InfoQIE(InfoExtractor)
           video_url = 'rtmpe://video.infoq.com/cfx/st/' + real_id
   
           # Extract title
- -        mobj = re.search(r'contentTitle = "(.*?)";', webpage)
- -        if mobj is None:
- -            raise ExtractorError(u'Unable to extract video title')
- -        video_title = mobj.group(1)
+ +        video_title = self._search_regex(r'contentTitle = "(.*?)";',
+ +            webpage, u'title')
   
           # Extract description
- -        video_description = u'No description available.'
- -        mobj = re.search(r'<meta name="description" content="(.*)"(?:\s*/)?>', webpage)
- -        if mobj is not None:
- -            video_description = mobj.group(1)
+ +        video_description = self._html_search_regex(r'<meta name="description" content="(.*)"(?:\s*/)?>',
+ +            webpage, u'description', fatal=False)
   
           video_filename = video_url.split('/')[-1]
           video_id, extension = video_filename.split('.')
@@@ -2844,10 -2834,15 +2844,10 @@@ class StanfordOpenClassroomIE(InfoExtra
                                           note='Downloading course info page',
                                           errnote='Unable to download course info page')
   
- -            m = re.search('<h1>([^<]+)</h1>', coursepage)
- -            if m:
- -                info['title'] = unescapeHTML(m.group(1))
- -            else:
- -                info['title'] = info['id']
+ +            info['title'] = self._html_search_regex('<h1>([^<]+)</h1>', coursepage, 'title', default=info['id'])
   
- -            m = re.search('<description>([^<]+)</description>', coursepage)
- -            if m:
- -                info['description'] = unescapeHTML(m.group(1))
+ +            info['description'] = self._html_search_regex('<description>([^<]+)</description>',
+ +                coursepage, u'description', fatal=False)
   
               links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
               info['list'] = [
@@@ -2908,17 -2903,25 +2908,17 @@@ class MTVIE(InfoExtractor)
   
           webpage = self._download_webpage(url, video_id)
   
- -        mobj = re.search(r'<meta name="mtv_vt" content="([^"]+)"/>', webpage)
- -        if mobj is None:
- -            raise ExtractorError(u'Unable to extract song name')
- -        song_name = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
- -        mobj = re.search(r'<meta name="mtv_an" content="([^"]+)"/>', webpage)
- -        if mobj is None:
- -            raise ExtractorError(u'Unable to extract performer')
- -        performer = unescapeHTML(mobj.group(1).decode('iso-8859-1'))
- -        video_title = performer + ' - ' + song_name
+ +        song_name = self._html_search_regex(r'<meta name="mtv_vt" content="([^"]+)"/>',
+ +            webpage, u'song name', fatal=False)
   
- -        mobj = re.search(r'<meta name="mtvn_uri" content="([^"]+)"/>', webpage)
- -        if mobj is None:
- -            raise ExtractorError(u'Unable to mtvn_uri')
- -        mtvn_uri = mobj.group(1)
+ +        video_title = self._html_search_regex(r'<meta name="mtv_an" content="([^"]+)"/>',
+ +            webpage, u'title')
   
- -        mobj = re.search(r'MTVN.Player.defaultPlaylistId = ([0-9]+);', webpage)
- -        if mobj is None:
- -            raise ExtractorError(u'Unable to extract content id')
- -        content_id = mobj.group(1)
+ +        mtvn_uri = self._html_search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>',
+ +            webpage, u'mtvn_uri', fatal=False)
+ +
+ +        content_id = self._search_regex(r'MTVN.Player.defaultPlaylistId = ([0-9]+);',
+ +            webpage, u'content id', fatal=False)
   
           videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
           self.report_extraction(video_id)
@@@ -3066,15 -3069,20 +3066,15 @@@ class XNXXIE(InfoExtractor)
           # Get webpage content
           webpage = self._download_webpage(url, video_id)
   
- -        result = re.search(self.VIDEO_URL_RE, webpage)
- -        if result is None:
- -            raise ExtractorError(u'Unable to extract video url')
- -        video_url = compat_urllib_parse.unquote(result.group(1))
+ +        video_url = self._search_regex(self.VIDEO_URL_RE,
+ +            webpage, u'video URL')
+ +        video_url = compat_urllib_parse.unquote(video_url)
   
- -        result = re.search(self.VIDEO_TITLE_RE, webpage)
- -        if result is None:
- -            raise ExtractorError(u'Unable to extract video title')
- -        video_title = result.group(1)
+ +        video_title = self._html_search_regex(self.VIDEO_TITLE_RE,
+ +            webpage, u'title')
   
- -        result = re.search(self.VIDEO_THUMB_RE, webpage)
- -        if result is None:
- -            raise ExtractorError(u'Unable to extract video thumbnail')
- -        video_thumbnail = result.group(1)
+ +        video_thumbnail = self._search_regex(self.VIDEO_THUMB_RE,
+ +            webpage, u'thumbnail', fatal=False)
   
           return [{
               'id': video_id,
@@@ -3094,6 -3102,26 +3094,6 @@@ class GooglePlusIE(InfoExtractor)
       _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
       IE_NAME = u'plus.google'
   
- -    def report_extract_entry(self, url):
- -        """Report downloading extry"""
- -        self.to_screen(u'Downloading entry: %s' % url)
- -
- -    def report_date(self, upload_date):
- -        """Report downloading extry"""
- -        self.to_screen(u'Entry date: %s' % upload_date)
- -
- -    def report_uploader(self, uploader):
- -        """Report downloading extry"""
- -        self.to_screen(u'Uploader: %s' % uploader)
- -
- -    def report_title(self, video_title):
- -        """Report downloading extry"""
- -        self.to_screen(u'Title: %s' % video_title)
- -
- -    def report_extract_vid_page(self, video_page):
- -        """Report information extraction."""
- -        self.to_screen(u'Extracting video page: %s' % video_page)
- -
       def _real_extract(self, url):
           # Extract id from URL
           mobj = re.match(self._VALID_URL, url)
@@@ -3106,31 -3134,47 +3106,31 @@@
           video_extension = 'flv'
   
           # Step 1, Retrieve post webpage to extract further information
- -        self.report_extract_entry(post_url)
           webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage')
   
+ +        self.report_extraction(video_id)
+ +
           # Extract update date
- -        upload_date = None
- -        pattern = 'title="Timestamp">(.*?)</a>'
- -        mobj = re.search(pattern, webpage)
- -        if mobj:
- -            upload_date = mobj.group(1)
+ +        upload_date = self._html_search_regex('title="Timestamp">(.*?)</a>',
+ +            webpage, u'upload date', fatal=False)
+ +        if upload_date:
               # Convert timestring to a format suitable for filename
               upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
               upload_date = upload_date.strftime('%Y%m%d')
- -        self.report_date(upload_date)
   
           # Extract uploader
- -        uploader = None
- -        pattern = r'rel\="author".*?>(.*?)</a>'
- -        mobj = re.search(pattern, webpage)
- -        if mobj:
- -            uploader = mobj.group(1)
- -        self.report_uploader(uploader)
+ +        uploader = self._html_search_regex(r'rel\="author".*?>(.*?)</a>',
+ +            webpage, u'uploader', fatal=False)
   
           # Extract title
           # Get the first line for title
- -        video_title = u'NA'
- -        pattern = r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]'
- -        mobj = re.search(pattern, webpage)
- -        if mobj:
- -            video_title = mobj.group(1)
- -        self.report_title(video_title)
+ +        video_title = self._html_search_regex(r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]',
+ +            webpage, 'title', default=u'NA')
   
           # Step 2, Stimulate clicking the image box to launch video
- -        pattern = '"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]'
- -        mobj = re.search(pattern, webpage)
- -        if mobj is None:
- -            raise ExtractorError(u'Unable to extract video page URL')
- -
- -        video_page = mobj.group(1)
+ +        video_page = self._search_regex('href="(https\://plus\.google\.com/photos/.*?)"',
+ +            webpage, u'video page URL')
           webpage = self._download_webpage(video_page, video_id, u'Downloading video page')
- -        self.report_extract_vid_page(video_page)
- -
   
           # Extract video links on video page
           """Extract video links of all sizes"""
@@@ -3163,7 -3207,7 +3163,7 @@@
           }]
   
   class NBAIE(InfoExtractor):
- -    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
+ +    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*?)(?:/index\.html)?(?:\?.*)?$'
       IE_NAME = u'nba'
   
       def _real_extract(self, url):
@@@ -3172,27 -3216,28 +3172,27 @@@
               raise ExtractorError(u'Invalid URL: %s' % url)
   
           video_id = mobj.group(1)
- -        if video_id.endswith('/index.html'):
- -            video_id = video_id[:-len('/index.html')]
   
           webpage = self._download_webpage(url, video_id)
   
           video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
- -        def _findProp(rexp, default=None):
- -            m = re.search(rexp, webpage)
- -            if m:
- -                return unescapeHTML(m.group(1))
- -            else:
- -                return default
   
           shortened_video_id = video_id.rpartition('/')[2]
- -        title = _findProp(r'<meta property="og:title" content="(.*?)"', shortened_video_id).replace('NBA.com: ', '')
+ +        title = self._html_search_regex(r'<meta property="og:title" content="(.*?)"',
+ +            webpage, 'title', default=shortened_video_id).replace('NBA.com: ', '')
+ +
+ +        # It isn't there in the HTML it returns to us
+ +        # uploader_date = self._html_search_regex(r'<b>Date:</b> (.*?)</div>', webpage, 'upload_date', fatal=False)
+ +
+ +        description = self._html_search_regex(r'<meta name="description" (?:content|value)="(.*?)" />', webpage, 'description', fatal=False)
+ +
           info = {
               'id': shortened_video_id,
               'url': video_url,
               'ext': 'mp4',
               'title': title,
- -            'uploader_date': _findProp(r'<b>Date:</b> (.*?)</div>'),
- -            'description': _findProp(r'<div class="description">(.*?)</h1>'),
+ +            # 'uploader_date': uploader_date,
+ +            'description': description,
           }
           return [info]
   
@@@ -3340,21 -3385,30 +3340,21 @@@ class FunnyOrDieIE(InfoExtractor)
           video_id = mobj.group('id')
           webpage = self._download_webpage(url, video_id)
   
- -        m = re.search(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"', webpage, re.DOTALL)
- -        if not m:
- -            raise ExtractorError(u'Unable to find video information')
- -        video_url = unescapeHTML(m.group('url'))
+ +        video_url = self._html_search_regex(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"',
+ +            webpage, u'video URL', flags=re.DOTALL)
   
- -        m = re.search(r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>", webpage, flags=re.DOTALL)
- -        if not m:
- -            m = re.search(r'<title>(?P<title>[^<]+?)</title>', webpage)
- -            if not m:
- -                raise ExtractorError(u'Cannot find video title')
- -        title = clean_html(m.group('title'))
+ +        title = self._html_search_regex((r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>",
+ +            r'<title>(?P<title>[^<]+?)</title>'), webpage, 'title', flags=re.DOTALL)
   
- -        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
- -        if m:
- -            desc = unescapeHTML(m.group('desc'))
- -        else:
- -            desc = None
+ +        video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
+ +            webpage, u'description', fatal=False, flags=re.DOTALL)
   
           info = {
               'id': video_id,
               'url': video_url,
               'ext': 'mp4',
               'title': title,
- -            'description': desc,
+ +            'description': video_description,
           }
           return [info]
   
@@@ -3410,29 -3464,27 +3410,29 @@@ class UstreamIE(InfoExtractor)
       def _real_extract(self, url):
           m = re.match(self._VALID_URL, url)
           video_id = m.group('videoID')
+ +
           video_url = u'http://tcdn.ustream.tv/video/%s' % video_id
           webpage = self._download_webpage(url, video_id)
+ +
           self.report_extraction(video_id)
- -        try:
- -            m = re.search(r'data-title="(?P<title>.+)"',webpage)
- -            title = m.group('title')
- -            m = re.search(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
- -                          webpage, re.DOTALL)
- -            uploader = unescapeHTML(m.group('uploader').strip())
- -            m = re.search(r'<link rel="image_src" href="(?P<thumb>.*?)"', webpage)
- -            thumb = m.group('thumb')
- -        except AttributeError:
- -            raise ExtractorError(u'Unable to extract info')
+ +
+ +        video_title = self._html_search_regex(r'data-title="(?P<title>.+)"',
+ +            webpage, u'title')
+ +
+ +        uploader = self._html_search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
+ +            webpage, u'uploader', fatal=False, flags=re.DOTALL)
+ +
+ +        thumbnail = self._html_search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"',
+ +            webpage, u'thumbnail', fatal=False)
+ +
           info = {
- -                'id':video_id,
- -                'url':video_url,
+ +                'id': video_id,
+ +                'url': video_url,
                   'ext': 'flv',
- -                'title': title,
+ +                'title': video_title,
                   'uploader': uploader,
- -                'thumbnail': thumb,
- -                  }
+ +                'thumbnail': thumbnail,
+ +               }
           return info
   
   class WorldStarHipHopIE(InfoExtractor):
@@@ -3440,36 -3492,45 +3440,36 @@@
       IE_NAME = u'WorldStarHipHop'
   
       def _real_extract(self, url):
- -        _src_url = r'so\.addVariable\("file","(.*?)"\)'
- -
           m = re.match(self._VALID_URL, url)
           video_id = m.group('id')
   
- -        webpage_src = self._download_webpage(url, video_id) 
+ +        webpage_src = self._download_webpage(url, video_id)
   
- -        mobj = re.search(_src_url, webpage_src)
+ +        video_url = self._search_regex(r'so\.addVariable\("file","(.*?)"\)',
+ +            webpage_src, u'video URL')
   
- -        if mobj is not None:
- -            video_url = mobj.group(1)
- -            if 'mp4' in video_url:
- -                ext = 'mp4'
- -            else:
- -                ext = 'flv'
+ +        if 'mp4' in video_url:
+ +            ext = 'mp4'
           else:
- -            raise ExtractorError(u'Cannot find video url for %s' % video_id)
- -
- -        mobj = re.search(r"<title>(.*)</title>", webpage_src)
+ +            ext = 'flv'
   
- -        if mobj is None:
- -            raise ExtractorError(u'Cannot determine title')
- -        title = mobj.group(1)
+ +        video_title = self._html_search_regex(r"<title>(.*)</title>",
+ +            webpage_src, u'title')
   
- -        mobj = re.search(r'rel="image_src" href="(.*)" />', webpage_src)
           # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
- -        if mobj is not None:
- -            thumbnail = mobj.group(1)
- -        else:
+ +        thumbnail = self._html_search_regex(r'rel="image_src" href="(.*)" />',
+ +            webpage_src, u'thumbnail', fatal=False)
+ +
+ +        if not thumbnail:
               _title = r"""candytitles.*>(.*)</span>"""
               mobj = re.search(_title, webpage_src)
               if mobj is not None:
- -                title = mobj.group(1)
- -            thumbnail = None
+ +                video_title = mobj.group(1)
   
           results = [{
                       'id': video_id,
                       'url' : video_url,
- -                    'title' : title,
+ +                    'title' : video_title,
                       'thumbnail' : thumbnail,
                       'ext' : ext,
                       }]
@@@ -3483,9 -3544,10 +3483,9 @@@ class RBMARadioIE(InfoExtractor)
           video_id = m.group('videoID')
   
           webpage = self._download_webpage(url, video_id)
- -        m = re.search(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>', webpage)
- -        if not m:
- -            raise ExtractorError(u'Cannot find metadata')
- -        json_data = m.group(1)
+ +
+ +        json_data = self._search_regex(r'<script>window.gon = {.*?};gon\.show=(.+?);</script>',
+ +            webpage, u'json data')
   
           try:
               data = json.loads(json_data)
@@@ -3532,33 -3594,42 +3532,33 @@@ class YouPornIE(InfoExtractor)
           mobj = re.match(self._VALID_URL, url)
           if mobj is None:
               raise ExtractorError(u'Invalid URL: %s' % url)
- -
           video_id = mobj.group('videoid')
   
           req = compat_urllib_request.Request(url)
           req.add_header('Cookie', 'age_verified=1')
           webpage = self._download_webpage(req, video_id)
   
- -        # Get the video title
- -        result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage)
- -        if result is None:
- -            raise ExtractorError(u'Unable to extract video title')
- -        video_title = result.group('title').strip()
- -
- -        # Get the video date
- -        result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage)
- -        if result is None:
- -            self._downloader.report_warning(u'unable to extract video date')
- -            upload_date = None
- -        else:
- -            upload_date = unified_strdate(result.group('date').strip())
+ +        # Get JSON parameters
+ +        json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters')
+ +        try:
+ +            params = json.loads(json_params)
+ +        except:
+ +            raise ExtractorError(u'Invalid JSON')
   
- -        # Get the video uploader
- -        result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage)
- -        if result is None:
- -            self._downloader.report_warning(u'unable to extract uploader')
- -            video_uploader = None
- -        else:
- -            video_uploader = result.group('uploader').strip()
- -            video_uploader = clean_html( video_uploader )
+ +        self.report_extraction(video_id)
+ +        try:
+ +            video_title = params['title']
+ +            upload_date = unified_strdate(params['release_date_f'])
+ +            video_description = params['description']
+ +            video_uploader = params['submitted_by']
+ +            thumbnail = params['thumbnails'][0]['image']
+ +        except KeyError:
+ +            raise ExtractorError('Missing JSON parameter: ' + sys.exc_info()[1])
   
           # Get all of the formats available
           DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
- -        result = re.search(DOWNLOAD_LIST_RE, webpage)
- -        if result is None:
- -            raise ExtractorError(u'Unable to extract download list')
- -        download_list_html = result.group('download_list').strip()
+ +        download_list_html = self._search_regex(DOWNLOAD_LIST_RE,
+ +            webpage, u'download list').strip()
   
           # Get all of the links from the page
           LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">'
@@@ -3582,18 -3653,19 +3582,18 @@@
               size = format[0]
               bitrate = format[1]
               format = "-".join( format )
- -            title = u'%s-%s-%s' % (video_title, size, bitrate)
+ +            # title = u'%s-%s-%s' % (video_title, size, bitrate)
   
               formats.append({
                   'id': video_id,
                   'url': video_url,
                   'uploader': video_uploader,
                   'upload_date': upload_date,
- -                'title': title,
+ +                'title': video_title,
                   'ext': extension,
                   'format': format,
- -                'thumbnail': None,
- -                'description': None,
- -                'player_url': None
+ +                'thumbnail': thumbnail,
+ +                'description': video_description
               })
   
           if self._downloader.params.get('listformats', None):
@@@ -3634,13 -3706,17 +3634,13 @@@ class PornotubeIE(InfoExtractor)
   
           # Get the video URL
           VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",'
- -        result = re.search(VIDEO_URL_RE, webpage)
- -        if result is None:
- -            raise ExtractorError(u'Unable to extract video url')
- -        video_url = compat_urllib_parse.unquote(result.group('url'))
+ +        video_url = self._search_regex(VIDEO_URL_RE, webpage, u'video url')
+ +        video_url = compat_urllib_parse.unquote(video_url)
   
           #Get the uploaded date
           VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
- -        result = re.search(VIDEO_UPLOADED_RE, webpage)
- -        if result is None:
- -            raise ExtractorError(u'Unable to extract video title')
- -        upload_date = unified_strdate(result.group('date'))
+ +        upload_date = self._html_search_regex(VIDEO_UPLOADED_RE, webpage, u'upload date', fatal=False)
+ +        if upload_date: upload_date = unified_strdate(upload_date)
   
           info = {'id': video_id,
                   'url': video_url,
@@@ -3667,8 -3743,10 +3667,8 @@@ class YouJizzIE(InfoExtractor)
           webpage = self._download_webpage(url, video_id)
   
           # Get the video title
- -        result = re.search(r'<title>(?P<title>.*)</title>', webpage)
- -        if result is None:
- -            raise ExtractorError(u'ERROR: unable to extract video title')
- -        video_title = result.group('title').strip()
+ +        video_title = self._html_search_regex(r'<title>(?P<title>.*)</title>',
+ +            webpage, u'title').strip()
   
           # Get the embed page
           result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
@@@ -3681,8 -3759,10 +3681,8 @@@
           webpage = self._download_webpage(embed_page_url, video_id)
   
           # Get the video URL
- -        result = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);', webpage)
- -        if result is None:
- -            raise ExtractorError(u'ERROR: unable to extract video url')
- -        video_url = result.group('source')
+ +        video_url = self._search_regex(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
+ +            webpage, u'video URL')
   
           info = {'id': video_id,
                   'url': video_url,
@@@ -3705,7 -3785,10 +3705,7 @@@ class EightTracksIE(InfoExtractor)
   
           webpage = self._download_webpage(url, playlist_id)
   
- -        m = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL)
- -        if not m:
- -            raise ExtractorError(u'Cannot find trax information')
- -        json_like = m.group(1)
+ +        json_like = self._search_regex(r"PAGE.mix = (.*?);\n", webpage, u'trax information', flags=re.DOTALL)
           data = json.loads(json_like)
   
           session = str(random.randint(0, 1000000000))
@@@ -3741,22 -3824,18 +3741,22 @@@ class KeekIE(InfoExtractor)
       def _real_extract(self, url):
           m = re.match(self._VALID_URL, url)
           video_id = m.group('videoID')
+ +
           video_url = u'http://cdn.keek.com/keek/video/%s' % video_id
           thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
           webpage = self._download_webpage(url, video_id)
- -        m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
- -        title = unescapeHTML(m.group('title'))
- -        m = re.search(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>', webpage)
- -        uploader = clean_html(m.group('uploader'))
+ +
+ +        video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
+ +            webpage, u'title')
+ +
+ +        uploader = self._html_search_regex(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
+ +            webpage, u'uploader', fatal=False)
+ +
           info = {
                   'id': video_id,
                   'url': video_url,
                   'ext': 'mp4',
- -                'title': title,
+ +                'title': video_title,
                   'thumbnail': thumbnail,
                   'uploader': uploader
           }
@@@ -3903,9 -3982,10 +3903,9 @@@ class SpiegelIE(InfoExtractor)
           video_id = m.group('videoID')
   
           webpage = self._download_webpage(url, video_id)
- -        m = re.search(r'<div class="module-title">(.*?)</div>', webpage)
- -        if not m:
- -            raise ExtractorError(u'Cannot find title')
- -        video_title = unescapeHTML(m.group(1))
+ +
+ +        video_title = self._html_search_regex(r'<div class="module-title">(.*?)</div>',
+ +            webpage, u'title')
   
           xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
           xml_code = self._download_webpage(xml_url, video_id,
@@@ -3941,25 -4021,35 +3941,25 @@@ class LiveLeakIE(InfoExtractor)
   
           webpage = self._download_webpage(url, video_id)
   
- -        m = re.search(r'file: "(.*?)",', webpage)
- -        if not m:
- -            raise ExtractorError(u'Unable to find video url')
- -        video_url = m.group(1)
+ +        video_url = self._search_regex(r'file: "(.*?)",',
+ +            webpage, u'video URL')
   
- -        m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage)
- -        if not m:
- -            raise ExtractorError(u'Cannot find video title')
- -        title = unescapeHTML(m.group('title')).replace('LiveLeak.com -', '').strip()
+ +        video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
+ +            webpage, u'title').replace('LiveLeak.com -', '').strip()
   
- -        m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage)
- -        if m:
- -            desc = unescapeHTML(m.group('desc'))
- -        else:
- -            desc = None
+ +        video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
+ +            webpage, u'description', fatal=False)
   
- -        m = re.search(r'By:.*?(\w+)</a>', webpage)
- -        if m:
- -            uploader = clean_html(m.group(1))
- -        else:
- -            uploader = None
+ +        video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
+ +            webpage, u'uploader', fatal=False)
   
           info = {
               'id':  video_id,
               'url': video_url,
               'ext': 'mp4',
- -            'title': title,
- -            'description': desc,
- -            'uploader': uploader
+ +            'title': video_title,
+ +            'description': video_description,
+ +            'uploader': video_uploader
           }
   
           return [info]
@@@ -4075,23 -4165,23 +4075,23 @@@ class TumblrIE(InfoExtractor)
           re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id)
           video = re.search(re_video, webpage)
           if video is None:
- -            self.to_screen("No video found")
- -            return []
+ +           raise ExtractorError(u'Unable to extract video')
           video_url = video.group('video_url')
           ext = video.group('ext')
   
- -        re_thumb = r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22'  # We pick the first poster
- -        thumb = re.search(re_thumb, webpage).group('thumb').replace('\\', '')
+ +        video_thumbnail = self._search_regex(r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22',
+ +            webpage, u'thumbnail', fatal=False)  # We pick the first poster
+ +        if video_thumbnail: video_thumbnail = video_thumbnail.replace('\\', '')
   
           # The only place where you can get a title, it's not complete,
           # but searching in other places doesn't work for all videos
- -        re_title = r'<title>(?P<title>.*?)</title>'
- -        title = unescapeHTML(re.search(re_title, webpage, re.DOTALL).group('title'))
+ +        video_title = self._html_search_regex(r'<title>(?P<title>.*?)</title>',
+ +            webpage, u'title', flags=re.DOTALL)
   
           return [{'id': video_id,
                    'url': video_url,
- -                 'title': title,
- -                 'thumbnail': thumb,
+ +                 'title': video_title,
+ +                 'thumbnail': video_thumbnail,
                    'ext': ext
                    }]
   
@@@ -4105,7 -4195,7 +4105,7 @@@ class BandcampIE(InfoExtractor)
           # We get the link to the free download page
           m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
           if m_download is None:
- -            raise ExtractorError(u'No free songs founded')
+ +            raise ExtractorError(u'No free songs found')
   
           download_link = m_download.group(1)
           id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$', 
@@@ -4133,10 -4223,10 +4133,10 @@@
   
           track_info = {'id':id,
                         'title' : info[u'title'],
- -                      'ext' : 'mp3',
- -                      'url' : final_url,
+ +                      'ext' :   'mp3',
+ +                      'url' :   final_url,
                         'thumbnail' : info[u'thumb_url'],
- -                      'uploader' : info[u'artist']
+ +                      'uploader' :  info[u'artist']
                         }
   
           return [track_info]
@@@ -4153,14 -4243,17 +4153,14 @@@ class RedTubeIE(InfoExtractor)
           video_id = mobj.group('id')
           video_extension = 'mp4'        
           webpage = self._download_webpage(url, video_id)
+ +
           self.report_extraction(video_id)
- -        mobj = re.search(r'<source src="'+'(.+)'+'" type="video/mp4">',webpage)
   
- -        if mobj is None:
- -            raise ExtractorError(u'Unable to extract media URL')
+ +        video_url = self._html_search_regex(r'<source src="(.+?)" type="video/mp4">',
+ +            webpage, u'video URL')
   
- -        video_url = mobj.group(1)
- -        mobj = re.search('<h1 class="videoTitle slidePanelMovable">(.+)</h1>',webpage)
- -        if mobj is None:
- -            raise ExtractorError(u'Unable to extract title')
- -        video_title = mobj.group(1)
+ +        video_title = self._html_search_regex('<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
+ +            webpage, u'title')
   
           return [{
               'id':       video_id,
@@@ -4181,13 -4274,15 +4181,13 @@@ class InaIE(InfoExtractor)
           video_extension = 'mp4'
           webpage = self._download_webpage(mrss_url, video_id)
   
- -        mobj = re.search(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)', webpage)
- -        if mobj is None:
- -            raise ExtractorError(u'Unable to extract media URL')
- -        video_url = mobj.group(1)
+ +        self.report_extraction(video_id)
   
- -        mobj = re.search(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>', webpage)
- -        if mobj is None:
- -            raise ExtractorError(u'Unable to extract title')
- -        video_title = mobj.group(1)
+ +        video_url = self._html_search_regex(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
+ +            webpage, u'video URL')
+ +
+ +        video_title = self._search_regex(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
+ +            webpage, u'title')
   
           return [{
               'id':       video_id,
@@@ -4209,17 -4304,27 +4209,17 @@@ class HowcastIE(InfoExtractor)
   
           self.report_extraction(video_id)
   
- -        mobj = re.search(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)"', webpage)
- -        if mobj is None:
- -            raise ExtractorError(u'Unable to extract video URL')
- -        video_url = mobj.group(1)
+ +        video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
+ +            webpage, u'video URL')
   
- -        mobj = re.search(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'', webpage)
- -        if mobj is None:
- -            raise ExtractorError(u'Unable to extract title')
- -        video_title = mobj.group(1) or mobj.group(2)
+ +        video_title = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
+ +            webpage, u'title')
   
- -        mobj = re.search(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'', webpage)
- -        if mobj is None:
- -            self._downloader.report_warning(u'unable to extract description')
- -            video_description = None
- -        else:
- -            video_description = mobj.group(1) or mobj.group(2)
+ +        video_description = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
+ +            webpage, u'description', fatal=False)
   
- -        mobj = re.search(r'<meta content=\'(.+?)\' property=\'og:image\'', webpage)
- -        if mobj is None:
- -            raise ExtractorError(u'Unable to extract thumbnail')
- -        thumbnail = mobj.group(1)
+ +        thumbnail = self._html_search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
+ +            webpage, u'thumbnail', fatal=False)
   
           return [{
               'id':       video_id,
@@@ -4235,6 -4340,7 +4235,6 @@@ class VineIE(InfoExtractor)
       _VALID_URL = r'(?:https?://)?(?:www\.)?vine\.co/v/(?P<id>\w+)'
   
       def _real_extract(self, url):
- -
           mobj = re.match(self._VALID_URL, url)
   
           video_id = mobj.group('id')
@@@ -4243,17 -4349,25 +4243,17 @@@
   
           self.report_extraction(video_id)
   
- -        mobj = re.search(r'<meta property="twitter:player:stream" content="(.+?)"', webpage)
- -        if mobj is None:
- -            raise ExtractorError(u'Unable to extract video URL')
- -        video_url = mobj.group(1)
+ +        video_url = self._html_search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
+ +            webpage, u'video URL')
   
- -        mobj = re.search(r'<meta property="og:title" content="(.+?)"', webpage)
- -        if mobj is None:
- -            raise ExtractorError(u'Unable to extract title')
- -        video_title = mobj.group(1)
+ +        video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
+ +            webpage, u'title')
   
- -        mobj = re.search(r'<meta property="og:image" content="(.+?)(\?.*?)?"', webpage)
- -        if mobj is None:
- -            raise ExtractorError(u'Unable to extract thumbnail')
- -        thumbnail = mobj.group(1)
+ +        thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"',
+ +            webpage, u'thumbnail', fatal=False)
   
- -        mobj = re.search(r'<div class="user">.*?<h2>(.+?)</h2>', webpage, re.DOTALL)
- -        if mobj is None:
- -            raise ExtractorError(u'Unable to extract uploader')
- -        uploader = mobj.group(1)
+ +        uploader = self._html_search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
+ +            webpage, u'uploader', fatal=False, flags=re.DOTALL)
   
           return [{
               'id':        video_id,
@@@ -4276,13 -4390,18 +4276,13 @@@ class FlickrIE(InfoExtractor)
           webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id
           webpage = self._download_webpage(webpage_url, video_id)
   
- -        mobj = re.search(r"photo_secret: '(\w+)'", webpage)
- -        if mobj is None:
- -            raise ExtractorError(u'Unable to extract video secret')
- -        secret = mobj.group(1)
+ +        secret = self._search_regex(r"photo_secret: '(\w+)'", webpage, u'secret')
   
           first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
           first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')
   
- -        mobj = re.search(r'<Item id="id">(\d+-\d+)</Item>', first_xml)
- -        if mobj is None:
- -            raise ExtractorError(u'Unable to extract node_id')
- -        node_id = mobj.group(1)
+ +        node_id = self._html_search_regex(r'<Item id="id">(\d+-\d+)</Item>',
+ +            first_xml, u'node_id')
   
           second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
           second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage')
@@@ -4294,14 -4413,22 +4294,14 @@@
               raise ExtractorError(u'Unable to extract video url')
           video_url = mobj.group(1) + unescapeHTML(mobj.group(2))
   
- -        mobj = re.search(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')', webpage)
- -        if mobj is None:
- -            raise ExtractorError(u'Unable to extract title')
- -        video_title = mobj.group(1) or mobj.group(2)
+ +        video_title = self._html_search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
+ +            webpage, u'video title')
   
- -        mobj = re.search(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')', webpage)
- -        if mobj is None:
- -            self._downloader.report_warning(u'unable to extract description')
- -            video_description = None
- -        else:
- -            video_description = mobj.group(1) or mobj.group(2)
+ +        video_description = self._html_search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
+ +            webpage, u'description', fatal=False)
   
- -        mobj = re.search(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')', webpage)
- -        if mobj is None:
- -            raise ExtractorError(u'Unable to extract thumbnail')
- -        thumbnail = mobj.group(1) or mobj.group(2)
+ +        thumbnail = self._html_search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
+ +            webpage, u'thumbnail', fatal=False)
   
           return [{
               'id':          video_id,
@@@ -4323,25 -4450,32 +4323,25 @@@ class TeamcocoIE(InfoExtractor)
           url_title = mobj.group('url_title')
           webpage = self._download_webpage(url, url_title)
   
- -        mobj = re.search(r'<article class="video" data-id="(\d+?)"', webpage)
- -        video_id = mobj.group(1)
+ +        video_id = self._html_search_regex(r'<article class="video" data-id="(\d+?)"',
+ +            webpage, u'video id')
   
           self.report_extraction(video_id)
   
- -        mobj = re.search(r'<meta property="og:title" content="(.+?)"', webpage)
- -        if mobj is None:
- -            raise ExtractorError(u'Unable to extract title')
- -        video_title = mobj.group(1)
+ +        video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
+ +            webpage, u'title')
   
- -        mobj = re.search(r'<meta property="og:image" content="(.+?)"', webpage)
- -        if mobj is None:
- -            raise ExtractorError(u'Unable to extract thumbnail')
- -        thumbnail = mobj.group(1)
+ +        thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)"',
+ +            webpage, u'thumbnail', fatal=False)
   
- -        mobj = re.search(r'<meta property="og:description" content="(.*?)"', webpage)
- -        if mobj is None:
- -            raise ExtractorError(u'Unable to extract description')
- -        description = mobj.group(1)
+ +        video_description = self._html_search_regex(r'<meta property="og:description" content="(.*?)"',
+ +            webpage, u'description', fatal=False)
   
           data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
           data = self._download_webpage(data_url, video_id, 'Downloading data webpage')
- -        mobj = re.search(r'<file type="high".*?>(.*?)</file>', data)
- -        if mobj is None:
- -            raise ExtractorError(u'Unable to extract video url')
- -        video_url = mobj.group(1)
+ +
+ +        video_url = self._html_search_regex(r'<file type="high".*?>(.*?)</file>',
+ +            data, u'video URL')
   
           return [{
               'id':          video_id,
@@@ -4349,9 -4483,9 +4349,9 @@@
               'ext':         'mp4',
               'title':       video_title,
               'thumbnail':   thumbnail,
- -            'description': description,
+ +            'description': video_description,
           }]
- -        
+ +
   class XHamsterIE(InfoExtractor):
       """Information Extractor for xHamster"""
       _VALID_URL = r'(?:http://)?(?:www.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'
@@@ -4360,9 -4494,8 +4360,9 @@@
           mobj = re.match(self._VALID_URL, url)
   
           video_id = mobj.group('id')
- -        mrss_url='http://xhamster.com/movies/%s/.html' % video_id
+ +        mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
           webpage = self._download_webpage(mrss_url, video_id)
+ +
           mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
           if mobj is None:
               raise ExtractorError(u'Unable to extract media URL')
@@@ -4372,33 -4505,39 +4372,33 @@@
               video_url = mobj.group('server')+'/key='+mobj.group('file')
           video_extension = video_url.split('.')[-1]
   
- -        mobj = re.search(r'<title>(?P<title>.+?) - xHamster\.com</title>', webpage)
- -        if mobj is None:
- -            raise ExtractorError(u'Unable to extract title')
- -        video_title = unescapeHTML(mobj.group('title'))
+ +        video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
+ +            webpage, u'title')
   
- -        mobj = re.search(r'<span>Description: </span>(?P<description>[^<]+)', webpage)
- -        if mobj is None:
- -            video_description = u''
- -        else:
- -            video_description = unescapeHTML(mobj.group('description'))
+ +        # Can't see the description anywhere in the UI
+ +        # video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
+ +        #     webpage, u'description', fatal=False)
+ +        # if video_description: video_description = unescapeHTML(video_description)
   
           mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
- -        if mobj is None:
- -            raise ExtractorError(u'Unable to extract upload date')
- -        video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
- -
- -        mobj = re.search(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^>]+)', webpage)
- -        if mobj is None:
- -            video_uploader_id = u'anonymous'
+ +        if mobj:
+ +            video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
           else:
- -            video_uploader_id = mobj.group('uploader_id')
+ +            video_upload_date = None
+ +            self._downloader.report_warning(u'Unable to extract upload date')
   
- -        mobj = re.search(r'\'image\':\'(?P<thumbnail>[^\']+)\'', webpage)
- -        if mobj is None:
- -            raise ExtractorError(u'Unable to extract thumbnail URL')
- -        video_thumbnail = mobj.group('thumbnail')
+ +        video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
+ +            webpage, u'uploader id', default=u'anonymous')
+ +
+ +        video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
+ +            webpage, u'thumbnail', fatal=False)
   
           return [{
               'id':       video_id,
               'url':      video_url,
               'ext':      video_extension,
               'title':    video_title,
- -            'description': video_description,
+ +            # 'description': video_description,
               'upload_date': video_upload_date,
               'uploader_id': video_uploader_id,
               'thumbnail': video_thumbnail
@@@ -4422,9 -4561,10 +4422,9 @@@ class HypemIE(InfoExtractor)
           cookie = urlh.headers.get('Set-Cookie', '')
   
           self.report_extraction(track_id)
- -        mobj = re.search(r'<script type="application/json" id="displayList-data">(.*?)</script>', response, flags=re.MULTILINE|re.DOTALL)
- -        if mobj is None:
- -            raise ExtractorError(u'Unable to extrack tracks')
- -        html_tracks = mobj.group(1).strip()
+ +
+ +        html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
+ +            response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
           try:
               track_list = json.loads(html_tracks)
               track = track_list[u'tracks'][0]
@@@ -4465,12 -4605,11 +4465,12 @@@ class Vbox7IE(InfoExtractor)
           video_id = mobj.group(1)
   
           redirect_page, urlh = self._download_webpage_handle(url, video_id)
- -        redirect_url = urlh.geturl() + re.search(r'window\.location = \'(.*)\';', redirect_page).group(1)
+ +        new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
+ +        redirect_url = urlh.geturl() + new_location
           webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')
   
- -        title = re.search(r'<title>(.*)</title>', webpage)
- -        title = (title.group(1)).split('/')[0].strip()
+ +        title = self._html_search_regex(r'<title>(.*)</title>',
+ +            webpage, u'title').split('/')[0].strip()
   
           ext = "flv"
           info_url = "http://vbox7.com/play/magare.do"
@@@ -4490,57 -4629,6 +4490,57 @@@
               'thumbnail': thumbnail_url,
           }]
   
+ +class GametrailersIE(InfoExtractor):
+ +    """Information extractor for gametrailers.com videos, reviews and full episodes.
+ +
+ +    The video page yields an ``mgid`` identifier, which is then used to query
+ +    the mrss feed (title, description, thumbnail) and the mediagen feed
+ +    (media URLs).
+ +    """
+ +    _VALID_URL = r'http://www.gametrailers.com/(?P<type>videos|reviews|full-episodes)/(?P<id>.*?)/(?P<title>.*)'
+ +
+ +    def _real_extract(self, url):
+ +        """Return the info dictionary for the video at *url*.
+ +
+ +        Raises ExtractorError if the URL is invalid or if the mgid, the
+ +        video info or the media URL cannot be found.
+ +        """
+ +        mobj = re.match(self._VALID_URL, url)
+ +        if mobj is None:
+ +            raise ExtractorError(u'Invalid URL: %s' % url)
+ +        video_id = mobj.group('id')
+ +        video_type = mobj.group('type')
+ +        webpage = self._download_webpage(url, video_id)
+ +        # Full episodes expose the mgid in a different attribute than
+ +        # regular videos and reviews
+ +        if video_type == 'full-episodes':
+ +            mgid_re = r'data-video="(?P<mgid>mgid:.*?)"'
+ +        else:
+ +            mgid_re = r'data-contentId=\'(?P<mgid>mgid:.*?)\''
+ +        mgid = self._search_regex(mgid_re, webpage, u'mgid')
+ +        data = compat_urllib_parse.urlencode({'uri': mgid, 'acceptMethods': 'fms'})
+ +
+ +        info_page = self._download_webpage('http://www.gametrailers.com/feeds/mrss?' + data,
+ +                                           video_id, u'Downloading video info')
+ +        links_webpage = self._download_webpage('http://www.gametrailers.com/feeds/mediagen/?' + data,
+ +                                               video_id, u'Downloading video urls info')
+ +
+ +        self.report_extraction(video_id)
+ +        # Verbose regex: whitespace inside the pattern is ignored
+ +        info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]></title>.*
+ +                      <description><!\[CDATA\[(?P<description>.*?)\]\]></description>.*
+ +                      <image>.*
+ +                        <url>(?P<thumb>.*?)</url>.*
+ +                      </image>'''
+ +
+ +        m_info = re.search(info_re, info_page, re.VERBOSE|re.DOTALL)
+ +        if m_info is None:
+ +            raise ExtractorError(u'Unable to extract video info')
+ +        video_title = m_info.group('title')
+ +        video_description = m_info.group('description')
+ +        video_thumb = m_info.group('thumb')
+ +
+ +        # Non-greedy match so that several <src> elements on the same line
+ +        # are not merged into a single bogus URL
+ +        m_urls = list(re.finditer(r'<src>(?P<url>.*?)</src>', links_webpage))
+ +        if not m_urls:
+ +            raise ExtractorError(u'Unable to extract video url')
+ +        # They are sorted from worst to best quality
+ +        video_url = m_urls[-1].group('url')
+ +
+ +        return {'url':         video_url,
+ +                'id':          video_id,
+ +                'title':       video_title,
+ +                # Videos are actually flv not mp4
+ +                'ext':         'flv',
+ +                'thumbnail':   video_thumb,
+ +                'description': video_description,
+ +                }
+ +
   def gen_extractors():
       """ Return a list of an instance of every supported extractor.
       The order does matter; the first extractor matched is the one handling the URL.
@@@ -4606,7 -4694,6 +4606,7 @@@
           XHamsterIE(),
           HypemIE(),
           Vbox7IE(),
+ +        GametrailersIE(),
           GenericIE()
       ]
author	Jaime Marquínez Ferrándiz <jaime.marquinez.ferrandiz@gmail.com>
	Wed, 19 Jun 2013 10:51:26 +0000 (12:51 +0200)
committer	Jaime Marquínez Ferrándiz <jaime.marquinez.ferrandiz@gmail.com>
	Wed, 19 Jun 2013 10:52:44 +0000 (12:52 +0200)
		1	2
test/test_youtube_lists.py	patch \|	diff1 \|	diff2 \|	blob \| history
youtube_dl/InfoExtractors.py	patch \|	diff1 \|	diff2 \|	blob \| history