add _search_regex to the new IEs

[youtube-dl] / youtube_dl / InfoExtractors.py
diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py

index fbf40f3ca92248574f3fb29d9a2f22e423ca24ba..2f926f24363947d38c129af8828cfab27f3dad7f 100755 (executable)
--- a/youtube_dl/InfoExtractors.py
+++ b/youtube_dl/InfoExtractors.py
@@ -216,13 +216,22 @@ class InfoExtractor(object):
          elif default is not None:
              return default
          elif fatal:
-            raise ExtractorError(u'Unable to extract %s; '
-                u'please report this issue on GitHub.' % _name)
+            raise ExtractorError(u'Unable to extract %s' % _name)
          else:
              self._downloader.report_warning(u'unable to extract %s; '
                  u'please report this issue on GitHub.' % _name)
              return None
  
+    def _html_search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
+        """
+        Same lookup as _search_regex, with a truthy match run through
+        clean_html() (strips HTML tags, unescapes entities) and strip().
+        """
+        found = self._search_regex(pattern, string, name, default, fatal, flags)
+        if not found:
+            return found
+        return clean_html(found).strip()
+
  class SearchInfoExtractor(InfoExtractor):
      """
      Base class for paged search queries extractors.
@@ -1400,6 +1409,9 @@ class GenericIE(InfoExtractor):
          if mobj is None:
              # Broaden the search a little bit: JWPlayer JS loader
              mobj = re.search(r'[^A-Za-z0-9]?file:\s*["\'](http[^\'"&]*)', webpage)
+        if mobj is None:
+            # Try to find twitter cards info
+            mobj = re.search(r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage)
          if mobj is None:
              raise ExtractorError(u'Invalid URL: %s' % url)
  
@@ -1421,16 +1433,12 @@ class GenericIE(InfoExtractor):
          #   Site Name | Video Title
          #   Video Title - Tagline | Site Name
          # and so on and so forth; it's just not practical
-        mobj = re.search(r'<title>(.*)</title>', webpage)
-        if mobj is None:
-            raise ExtractorError(u'Unable to extract title')
-        video_title = mobj.group(1)
+        video_title = self._html_search_regex(r'<title>(.*)</title>',
+            webpage, u'video title')
  
          # video uploader is domain name
-        mobj = re.match(r'(?:https?://)?([^/]*)/.*', url)
-        if mobj is None:
-            raise ExtractorError(u'Unable to extract title')
-        video_uploader = mobj.group(1)
+        video_uploader = self._search_regex(r'(?:https?://)?([^/]*)/.*',
+            url, u'video uploader')
  
          return [{
              'id':       video_id,
@@ -1451,7 +1459,6 @@ class YoutubeSearchIE(SearchInfoExtractor):
  
      def report_download_page(self, query, pagenum):
          """Report attempt to download search page with given number."""
-        query = query.decode(preferredencoding())
          self._downloader.to_screen(u'[youtube] query "%s": Downloading page %s' % (query, pagenum))
  
      def _get_n_results(self, query, n):
@@ -1924,9 +1931,8 @@ class FacebookIE(InfoExtractor):
          video_duration = int(video_data['video_duration'])
          thumbnail = video_data['thumbnail_src']
  
-        video_title = self._search_regex('<h2 class="uiHeaderTitle">([^<]+)</h2>',
+        video_title = self._html_search_regex('<h2 class="uiHeaderTitle">([^<]+)</h2>',
              webpage, u'title')
-        video_title = unescapeHTML(video_title)
  
          info = {
              'id': video_id,
@@ -2088,7 +2094,7 @@ class MyVideoIE(InfoExtractor):
              self.report_extraction(video_id)
              video_url = mobj.group(1) + '.flv'
  
-            video_title = self._search_regex('<title>([^<]+)</title>',
+            video_title = self._html_search_regex('<title>([^<]+)</title>',
                  webpage, u'title')
  
              video_ext = self._search_regex('[.](.+?)$', video_url, u'extension')
@@ -2170,7 +2176,7 @@ class MyVideoIE(InfoExtractor):
          video_swfobj = self._search_regex('swfobject.embedSWF\(\'(.+?)\'', webpage, u'swfobj')
          video_swfobj = compat_urllib_parse.unquote(video_swfobj)
  
-        video_title = self._search_regex("<h1(?: class='globalHd')?>(.*?)</h1>",
+        video_title = self._html_search_regex("<h1(?: class='globalHd')?>(.*?)</h1>",
              webpage, u'title')
  
          return [{
@@ -2369,25 +2375,25 @@ class EscapistIE(InfoExtractor):
          showName = mobj.group('showname')
          videoId = mobj.group('episode')
  
-        self.report_extraction(showName)
-        webpage = self._download_webpage(url, showName)
+        self.report_extraction(videoId)
+        webpage = self._download_webpage(url, videoId)
  
-        videoDesc = self._search_regex('<meta name="description" content="([^"]*)"',
+        videoDesc = self._html_search_regex('<meta name="description" content="([^"]*)"',
              webpage, u'description', fatal=False)
-        if videoDesc: videoDesc = unescapeHTML(videoDesc)
  
-        imgUrl = self._search_regex('<meta property="og:image" content="([^"]*)"',
+        imgUrl = self._html_search_regex('<meta property="og:image" content="([^"]*)"',
              webpage, u'thumbnail', fatal=False)
-        if imgUrl: imgUrl = unescapeHTML(imgUrl)
  
-        playerUrl = self._search_regex('<meta property="og:video" content="([^"]*)"',
+        playerUrl = self._html_search_regex('<meta property="og:video" content="([^"]*)"',
              webpage, u'player url')
-        playerUrl = unescapeHTML(playerUrl)
+
+        title = self._html_search_regex('<meta name="title" content="([^"]*)"',
+            webpage, u'player url').split(' : ')[-1]
  
          configUrl = self._search_regex('config=(.*)$', playerUrl, u'config url')
          configUrl = compat_urllib_parse.unquote(configUrl)
  
-        configJSON = self._download_webpage(configUrl, showName,
+        configJSON = self._download_webpage(configUrl, videoId,
                                              u'Downloading configuration',
                                              u'unable to download configuration')
  
@@ -2407,7 +2413,7 @@ class EscapistIE(InfoExtractor):
              'url': videoUrl,
              'uploader': showName,
              'upload_date': None,
-            'title': showName,
+            'title': title,
              'ext': 'mp4',
              'thumbnail': imgUrl,
              'description': videoDesc,
@@ -2500,7 +2506,7 @@ class XVideosIE(InfoExtractor):
              webpage, u'video URL'))
  
          # Extract title
-        video_title = self._search_regex(r'<title>(.*?)\s+-\s+XVID',
+        video_title = self._html_search_regex(r'<title>(.*?)\s+-\s+XVID',
              webpage, u'title')
  
          # Extract video thumbnail
@@ -2666,7 +2672,7 @@ class InfoQIE(InfoExtractor):
              webpage, u'title')
  
          # Extract description
-        video_description = self._search_regex(r'<meta name="description" content="(.*)"(?:\s*/)?>',
+        video_description = self._html_search_regex(r'<meta name="description" content="(.*)"(?:\s*/)?>',
              webpage, u'description', fatal=False)
  
          video_filename = video_url.split('/')[-1]
@@ -2838,12 +2844,10 @@ class StanfordOpenClassroomIE(InfoExtractor):
                                          note='Downloading course info page',
                                          errnote='Unable to download course info page')
  
-            info['title'] = self._search_regex('<h1>([^<]+)</h1>', coursepage, 'title', default=info['id'])
-            info['title'] = unescapeHTML(info['title'])
+            info['title'] = self._html_search_regex('<h1>([^<]+)</h1>', coursepage, 'title', default=info['id'])
  
-            info['description'] = self._search_regex('<description>([^<]+)</description>',
+            info['description'] = self._html_search_regex('<description>([^<]+)</description>',
                  coursepage, u'description', fatal=False)
-            if info['description']: info['description'] = unescapeHTML(info['description'])
  
              links = orderedSet(re.findall('<a href="(VideoPage.php\?[^"]+)">', coursepage))
              info['list'] = [
@@ -2904,15 +2908,13 @@ class MTVIE(InfoExtractor):
  
          webpage = self._download_webpage(url, video_id)
  
-        song_name = self._search_regex(r'<meta name="mtv_vt" content="([^"]+)"/>',
+        song_name = self._html_search_regex(r'<meta name="mtv_vt" content="([^"]+)"/>',
              webpage, u'song name', fatal=False)
-        if song_name: song_name = unescapeHTML(song_name)
  
-        video_title = self._search_regex(r'<meta name="mtv_an" content="([^"]+)"/>',
+        video_title = self._html_search_regex(r'<meta name="mtv_an" content="([^"]+)"/>',
              webpage, u'title')
-        video_title = unescapeHTML(video_title)
  
-        mtvn_uri = self._search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>',
+        mtvn_uri = self._html_search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>',
              webpage, u'mtvn_uri', fatal=False)
  
          content_id = self._search_regex(r'MTVN.Player.defaultPlaylistId = ([0-9]+);',
@@ -3068,7 +3070,7 @@ class XNXXIE(InfoExtractor):
              webpage, u'video URL')
          video_url = compat_urllib_parse.unquote(video_url)
  
-        video_title = self._search_regex(self.VIDEO_TITLE_RE,
+        video_title = self._html_search_regex(self.VIDEO_TITLE_RE,
              webpage, u'title')
  
          video_thumbnail = self._search_regex(self.VIDEO_THUMB_RE,
@@ -3109,7 +3111,7 @@ class GooglePlusIE(InfoExtractor):
          self.report_extraction(video_id)
  
          # Extract update date
-        upload_date = self._search_regex('title="Timestamp">(.*?)</a>',
+        upload_date = self._html_search_regex('title="Timestamp">(.*?)</a>',
              webpage, u'upload date', fatal=False)
          if upload_date:
              # Convert timestring to a format suitable for filename
@@ -3117,12 +3119,12 @@ class GooglePlusIE(InfoExtractor):
              upload_date = upload_date.strftime('%Y%m%d')
  
          # Extract uploader
-        uploader = self._search_regex(r'rel\="author".*?>(.*?)</a>',
+        uploader = self._html_search_regex(r'rel\="author".*?>(.*?)</a>',
              webpage, u'uploader', fatal=False)
  
          # Extract title
          # Get the first line for title
-        video_title = self._search_regex(r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]',
+        video_title = self._html_search_regex(r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]',
              webpage, 'title', default=u'NA')
  
          # Step 2, Stimulate clicking the image box to launch video
@@ -3161,7 +3163,7 @@ class GooglePlusIE(InfoExtractor):
          }]
  
  class NBAIE(InfoExtractor):
-    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*)(\?.*)?$'
+    _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*?)(?:/index\.html)?(?:\?.*)?$'
      IE_NAME = u'nba'
  
      def _real_extract(self, url):
@@ -3170,27 +3172,26 @@ class NBAIE(InfoExtractor):
              raise ExtractorError(u'Invalid URL: %s' % url)
  
          video_id = mobj.group(1)
-        if video_id.endswith('/index.html'):
-            video_id = video_id[:-len('/index.html')]
  
          webpage = self._download_webpage(url, video_id)
  
          video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
  
          shortened_video_id = video_id.rpartition('/')[2]
-        title = self._search_regex(r'<meta property="og:title" content="(.*?)"',
+        title = self._html_search_regex(r'<meta property="og:title" content="(.*?)"',
              webpage, 'title', default=shortened_video_id).replace('NBA.com: ', '')
  
-        uploader_date = self._search_regex(r'<b>Date:</b> (.*?)</div>', webpage, 'upload_date', fatal=False)
+        # It isn't there in the HTML it returns to us
+        # uploader_date = self._html_search_regex(r'<b>Date:</b> (.*?)</div>', webpage, 'upload_date', fatal=False)
  
-        description = self._search_regex(r'<div class="description">(.*?)</h1>', webpage, 'description', fatal=False)
+        description = self._html_search_regex(r'<meta name="description" (?:content|value)="(.*?)" />', webpage, 'description', fatal=False)
  
          info = {
              'id': shortened_video_id,
              'url': video_url,
              'ext': 'mp4',
              'title': title,
-            'uploader_date': uploader_date,
+            # 'uploader_date': uploader_date,
              'description': description,
          }
          return [info]
@@ -3339,17 +3340,14 @@ class FunnyOrDieIE(InfoExtractor):
          video_id = mobj.group('id')
          webpage = self._download_webpage(url, video_id)
  
-        video_url = self._search_regex(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"',
+        video_url = self._html_search_regex(r'<video[^>]*>\s*<source[^>]*>\s*<source src="(?P<url>[^"]+)"',
              webpage, u'video URL', flags=re.DOTALL)
-        video_url = unescapeHTML(video_url)
  
-        title = self._search_regex((r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>",
+        title = self._html_search_regex((r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>",
              r'<title>(?P<title>[^<]+?)</title>'), webpage, 'title', flags=re.DOTALL)
-        title = clean_html(title)
  
-        video_description = self._search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
-            webpage, u'description', flags=re.DOTALL)
-        if video_description: video_description = unescapeHTML(video_description)
+        video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
+            webpage, u'description', fatal=False, flags=re.DOTALL)
  
          info = {
              'id': video_id,
@@ -3418,14 +3416,13 @@ class UstreamIE(InfoExtractor):
  
          self.report_extraction(video_id)
  
-        video_title = self._search_regex(r'data-title="(?P<title>.+)"',
+        video_title = self._html_search_regex(r'data-title="(?P<title>.+)"',
              webpage, u'title')
  
-        uploader = self._search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
+        uploader = self._html_search_regex(r'data-content-type="channel".*?>(?P<uploader>.*?)</a>',
              webpage, u'uploader', fatal=False, flags=re.DOTALL)
-        if uploader: uploader = unescapeHTML(uploader.strip())
  
-        thumbnail = self._search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"',
+        thumbnail = self._html_search_regex(r'<link rel="image_src" href="(?P<thumb>.*?)"',
              webpage, u'thumbnail', fatal=False)
  
          info = {
@@ -3456,11 +3453,11 @@ class WorldStarHipHopIE(InfoExtractor):
          else:
              ext = 'flv'
  
-        video_title = self._search_regex(r"<title>(.*)</title>",
+        video_title = self._html_search_regex(r"<title>(.*)</title>",
              webpage_src, u'title')
  
          # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
-        thumbnail = self._search_regex(r'rel="image_src" href="(.*)" />',
+        thumbnail = self._html_search_regex(r'rel="image_src" href="(.*)" />',
              webpage_src, u'thumbnail', fatal=False)
  
          if not thumbnail:
@@ -3541,19 +3538,22 @@ class YouPornIE(InfoExtractor):
          req.add_header('Cookie', 'age_verified=1')
          webpage = self._download_webpage(req, video_id)
  
-        # Get the video title
-        video_title = self._search_regex(r'<h1.*?>(?P<title>.*)</h1>',
-            webpage, u'title').strip()
-
-        # Get the video date
-        upload_date = self._search_regex(r'Date:</label>(?P<date>.*) </li>',
-            webpage, u'upload date', fatal=False)
-        if upload_date: upload_date = unified_strdate(upload_date.strip())
+        # Get JSON parameters
+        json_params = self._search_regex(r'var currentVideo = new Video\((.*)\);', webpage, u'JSON parameters')
+        try:
+            params = json.loads(json_params)
+        except:
+            raise ExtractorError(u'Invalid JSON')
  
-        # Get the video uploader
-        video_uploader = self._search_regex(r'Submitted:</label>(?P<uploader>.*)</li>',
-            webpage, u'uploader', fatal=False)
-        if video_uploader: video_uploader = clean_html(video_uploader.strip())
+        self.report_extraction(video_id)
+        try:
+            video_title = params['title']
+            upload_date = unified_strdate(params['release_date_f'])
+            video_description = params['description']
+            video_uploader = params['submitted_by']
+            thumbnail = params['thumbnails'][0]['image']
+        except KeyError:
+            raise ExtractorError('Missing JSON parameter: ' + sys.exc_info()[1])
  
          # Get all of the formats available
          DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>'
@@ -3582,19 +3582,18 @@ class YouPornIE(InfoExtractor):
              size = format[0]
              bitrate = format[1]
              format = "-".join( format )
-            title = u'%s-%s-%s' % (video_title, size, bitrate)
+            # title = u'%s-%s-%s' % (video_title, size, bitrate)
  
              formats.append({
                  'id': video_id,
                  'url': video_url,
                  'uploader': video_uploader,
                  'upload_date': upload_date,
-                'title': title,
+                'title': video_title,
                  'ext': extension,
                  'format': format,
-                'thumbnail': None,
-                'description': None,
-                'player_url': None
+                'thumbnail': thumbnail,
+                'description': video_description
              })
  
          if self._downloader.params.get('listformats', None):
@@ -3640,7 +3639,7 @@ class PornotubeIE(InfoExtractor):
  
          #Get the uploaded date
          VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by'
-        upload_date = self._search_regex(VIDEO_UPLOADED_RE, webpage, u'upload date', fatal=False)
+        upload_date = self._html_search_regex(VIDEO_UPLOADED_RE, webpage, u'upload date', fatal=False)
          if upload_date: upload_date = unified_strdate(upload_date)
  
          info = {'id': video_id,
@@ -3668,7 +3667,7 @@ class YouJizzIE(InfoExtractor):
          webpage = self._download_webpage(url, video_id)
  
          # Get the video title
-        video_title = self._search_regex(r'<title>(?P<title>.*)</title>',
+        video_title = self._html_search_regex(r'<title>(?P<title>.*)</title>',
              webpage, u'title').strip()
  
          # Get the embed page
@@ -3747,13 +3746,11 @@ class KeekIE(InfoExtractor):
          thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
          webpage = self._download_webpage(url, video_id)
  
-        video_title = self._search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
+        video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
              webpage, u'title')
-        video_title = unescapeHTML(video_title)
  
-        uploader = self._search_regex(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
+        uploader = self._html_search_regex(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
              webpage, u'uploader', fatal=False)
-        if uploader: uploader = clean_html(uploader)
  
          info = {
                  'id': video_id,
@@ -3907,9 +3904,8 @@ class SpiegelIE(InfoExtractor):
  
          webpage = self._download_webpage(url, video_id)
  
-        video_title = self._search_regex(r'<div class="module-title">(.*?)</div>',
+        video_title = self._html_search_regex(r'<div class="module-title">(.*?)</div>',
              webpage, u'title')
-        video_title = unescapeHTML(video_title)
  
          xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml'
          xml_code = self._download_webpage(xml_url, video_id,
@@ -3948,15 +3944,13 @@ class LiveLeakIE(InfoExtractor):
          video_url = self._search_regex(r'file: "(.*?)",',
              webpage, u'video URL')
  
-        video_title = self._search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
-            webpage, u'title')
-        video_title = unescapeHTML(video_title).replace('LiveLeak.com -', '').strip()
+        video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
+            webpage, u'title').replace('LiveLeak.com -', '').strip()
  
-        video_description = self._search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
+        video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
              webpage, u'description', fatal=False)
-        if video_description: video_description = unescapeHTML(video_description)
  
-        video_uploader = self._search_regex(r'By:.*?(\w+)</a>',
+        video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
              webpage, u'uploader', fatal=False)
  
          info = {
@@ -4009,6 +4003,64 @@ class ARDIE(InfoExtractor):
              info["url"] = stream["video_url"]
          return [info]
  
+class ZDFIE(InfoExtractor):
+    """Information extractor for ZDFmediathek video pages on www.zdf.de."""
+    _VALID_URL = r'^http://www\.zdf\.de\/ZDFmediathek\/(.*beitrag\/video\/)(?P<video_id>[^/\?]+)(?:\?.*)?'
+    _TITLE = r'<h1(?: class="beitragHeadline")?>(?P<title>.*)</h1>'
+    _MEDIA_STREAM = r'<a href="(?P<video_url>.+(?P<media_type>.streaming).+/zdf/(?P<quality>[^\/]+)/[^"]*)".+class="play".+>'
+    _MMS_STREAM = r'href="(?P<video_url>mms://[^"]*)"'
+    _RTSP_STREAM = r'(?P<video_url>rtsp://[^"]*.mp4)'
+
+    def _real_extract(self, url):
+        """Return a one-item list holding the info dict; raise ExtractorError on failure."""
+        mobj = re.match(self._VALID_URL, url)
+        if mobj is None:
+            raise ExtractorError(u'Invalid URL: %s' % url)
+        video_id = mobj.group('video_id')
+
+        html = self._download_webpage(url, video_id)
+        streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)]
+        if not streams:  # a list comprehension is never None; empty means no match
+            raise ExtractorError(u'No media url found.')
+
+        # s['media_type'] == 'wstreaming' -> use 'Windows Media Player' and mms url
+        # s['media_type'] == 'hstreaming' -> use 'Quicktime' and rtsp url
+        # Only 'wstreaming' is used for now ('hstreaming' - rtsp is not working).
+        # Prefer quality 'veryhigh' (dsl2000mbit), else fall back to '300' (dsl1000mbit).
+        wstreams = [s for s in streams if s['media_type'] == 'wstreaming']
+        stream_ = None  # stays None when nothing matches, so the check below can fire
+        for quality in ('veryhigh', '300'):
+            stream_ = next((s for s in wstreams if s['quality'] == quality), None)
+            if stream_ is not None:
+                break
+        if stream_ is None:
+            raise ExtractorError(u'No stream found.')
+
+        media_link = self._download_webpage(stream_['video_url'], video_id,'Get stream URL')
+
+        self.report_extraction(video_id)
+        mobj = re.search(self._TITLE, html)
+        if mobj is None:
+            raise ExtractorError(u'Cannot extract title')
+        title = unescapeHTML(mobj.group('title'))
+
+        mobj = (re.search(self._MMS_STREAM, media_link) or
+                re.search(self._RTSP_STREAM, media_link))
+        if mobj is None:
+            raise ExtractorError(u'Cannot extract mms:// or rtsp:// URL')
+        mms_url = mobj.group('video_url')
+
+        mobj = re.search('(.*)[.](?P<ext>[^.]+)', mms_url)
+        if mobj is None:
+            raise ExtractorError(u'Cannot extract extention')
+        ext = mobj.group('ext')
+
+        return [{'id': video_id,
+                 'url': mms_url,
+                 'title': title,
+                 'ext': ext
+                 }]
+
  class TumblrIE(InfoExtractor):
      _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)'
  
@@ -4033,9 +4085,8 @@ class TumblrIE(InfoExtractor):
  
          # The only place where you can get a title, it's not complete,
          # but searching in other places doesn't work for all videos
-        video_title = self._search_regex(r'<title>(?P<title>.*?)</title>',
+        video_title = self._html_search_regex(r'<title>(?P<title>.*?)</title>',
              webpage, u'title', flags=re.DOTALL)
-        video_title = unescapeHTML(video_title)
  
          return [{'id': video_id,
                   'url': video_url,
@@ -4105,10 +4156,10 @@ class RedTubeIE(InfoExtractor):
  
          self.report_extraction(video_id)
  
-        video_url = self._search_regex(r'<source src="(.+?)" type="video/mp4">',
+        video_url = self._html_search_regex(r'<source src="(.+?)" type="video/mp4">',
              webpage, u'video URL')
  
-        video_title = self._search_regex('<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
+        video_title = self._html_search_regex('<h1 class="videoTitle slidePanelMovable">(.+?)</h1>',
              webpage, u'title')
  
          return [{
@@ -4132,7 +4183,7 @@ class InaIE(InfoExtractor):
  
          self.report_extraction(video_id)
  
-        video_url = self._search_regex(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
+        video_url = self._html_search_regex(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)',
              webpage, u'video URL')
  
          video_title = self._search_regex(r'<title><!\[CDATA\[(?P<titre>.*?)]]></title>',
@@ -4161,13 +4212,13 @@ class HowcastIE(InfoExtractor):
          video_url = self._search_regex(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)',
              webpage, u'video URL')
  
-        video_title = self._search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
+        video_title = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') property=\'og:title\'',
              webpage, u'title')
  
-        video_description = self._search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
+        video_description = self._html_search_regex(r'<meta content=(?:"([^"]+)"|\'([^\']+)\') name=\'description\'',
              webpage, u'description', fatal=False)
  
-        thumbnail = self._search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
+        thumbnail = self._html_search_regex(r'<meta content=\'(.+?)\' property=\'og:image\'',
              webpage, u'thumbnail', fatal=False)
  
          return [{
@@ -4192,16 +4243,16 @@ class VineIE(InfoExtractor):
  
          self.report_extraction(video_id)
  
-        video_url = self._search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
+        video_url = self._html_search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
              webpage, u'video URL')
  
-        video_title = self._search_regex(r'<meta property="og:title" content="(.+?)"',
+        video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
              webpage, u'title')
  
-        thumbnail = self._search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"',
+        thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"',
              webpage, u'thumbnail', fatal=False)
  
-        uploader = self._search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
+        uploader = self._html_search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
              webpage, u'uploader', fatal=False, flags=re.DOTALL)
  
          return [{
@@ -4230,7 +4281,7 @@ class FlickrIE(InfoExtractor):
          first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self'
          first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage')
  
-        node_id = self._search_regex(r'<Item id="id">(\d+-\d+)</Item>',
+        node_id = self._html_search_regex(r'<Item id="id">(\d+-\d+)</Item>',
              first_xml, u'node_id')
  
          second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1'
@@ -4243,13 +4294,13 @@ class FlickrIE(InfoExtractor):
              raise ExtractorError(u'Unable to extract video url')
          video_url = mobj.group(1) + unescapeHTML(mobj.group(2))
  
-        video_title = self._search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
+        video_title = self._html_search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
              webpage, u'video title')
  
-        video_description = self._search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
+        video_description = self._html_search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
              webpage, u'description', fatal=False)
  
-        thumbnail = self._search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
+        thumbnail = self._html_search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
              webpage, u'thumbnail', fatal=False)
  
          return [{
@@ -4272,24 +4323,24 @@ class TeamcocoIE(InfoExtractor):
          url_title = mobj.group('url_title')
          webpage = self._download_webpage(url, url_title)
  
-        video_id = self._search_regex(r'<article class="video" data-id="(\d+?)"',
+        video_id = self._html_search_regex(r'<article class="video" data-id="(\d+?)"',
              webpage, u'video id')
  
          self.report_extraction(video_id)
  
-        video_title = self._search_regex(r'<meta property="og:title" content="(.+?)"',
+        video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
              webpage, u'title')
  
-        thumbnail = self._search_regex(r'<meta property="og:image" content="(.+?)"',
+        thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)"',
              webpage, u'thumbnail', fatal=False)
  
-        video_description = self._search_regex(r'<meta property="og:description" content="(.*?)"',
+        video_description = self._html_search_regex(r'<meta property="og:description" content="(.*?)"',
              webpage, u'description', fatal=False)
  
          data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
          data = self._download_webpage(data_url, video_id, 'Downloading data webpage')
  
-        video_url = self._search_regex(r'<file type="high".*?>(.*?)</file>',
+        video_url = self._html_search_regex(r'<file type="high".*?>(.*?)</file>',
              data, u'video URL')
  
          return [{
@@ -4300,7 +4351,7 @@ class TeamcocoIE(InfoExtractor):
              'thumbnail':   thumbnail,
              'description': video_description,
          }]
-        
+
  class XHamsterIE(InfoExtractor):
      """Information Extractor for xHamster"""
      _VALID_URL = r'(?:http://)?(?:www.)?xhamster\.com/movies/(?P<id>[0-9]+)/.*\.html'
@@ -4309,8 +4360,9 @@ class XHamsterIE(InfoExtractor):
          mobj = re.match(self._VALID_URL, url)
  
          video_id = mobj.group('id')
-        mrss_url='http://xhamster.com/movies/%s/.html' % video_id
+        mrss_url = 'http://xhamster.com/movies/%s/.html' % video_id
          webpage = self._download_webpage(mrss_url, video_id)
+
          mobj = re.search(r'\'srv\': \'(?P<server>[^\']*)\',\s*\'file\': \'(?P<file>[^\']+)\',', webpage)
          if mobj is None:
              raise ExtractorError(u'Unable to extract media URL')
@@ -4320,39 +4372,33 @@ class XHamsterIE(InfoExtractor):
              video_url = mobj.group('server')+'/key='+mobj.group('file')
          video_extension = video_url.split('.')[-1]
  
-        mobj = re.search(r'<title>(?P<title>.+?) - xHamster\.com</title>', webpage)
-        if mobj is None:
-            raise ExtractorError(u'Unable to extract title')
-        video_title = unescapeHTML(mobj.group('title'))
+        video_title = self._html_search_regex(r'<title>(?P<title>.+?) - xHamster\.com</title>',
+            webpage, u'title')
  
-        mobj = re.search(r'<span>Description: </span>(?P<description>[^<]+)', webpage)
-        if mobj is None:
-            video_description = u''
-        else:
-            video_description = unescapeHTML(mobj.group('description'))
+        # Can't see the description anywhere in the UI
+        # video_description = self._html_search_regex(r'<span>Description: </span>(?P<description>[^<]+)',
+        #     webpage, u'description', fatal=False)
+        # if video_description: video_description = unescapeHTML(video_description)
  
          mobj = re.search(r'hint=\'(?P<upload_date_Y>[0-9]{4})-(?P<upload_date_m>[0-9]{2})-(?P<upload_date_d>[0-9]{2}) [0-9]{2}:[0-9]{2}:[0-9]{2} [A-Z]{3,4}\'', webpage)
-        if mobj is None:
-            raise ExtractorError(u'Unable to extract upload date')
-        video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
-
-        mobj = re.search(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^>]+)', webpage)
-        if mobj is None:
-            video_uploader_id = u'anonymous'
+        if mobj:
+            video_upload_date = mobj.group('upload_date_Y')+mobj.group('upload_date_m')+mobj.group('upload_date_d')
          else:
-            video_uploader_id = mobj.group('uploader_id')
+            video_upload_date = None
+            self._downloader.report_warning(u'Unable to extract upload date')
  
-        mobj = re.search(r'\'image\':\'(?P<thumbnail>[^\']+)\'', webpage)
-        if mobj is None:
-            raise ExtractorError(u'Unable to extract thumbnail URL')
-        video_thumbnail = mobj.group('thumbnail')
+        video_uploader_id = self._html_search_regex(r'<a href=\'/user/[^>]+>(?P<uploader_id>[^<]+)',
+            webpage, u'uploader id', default=u'anonymous')
+
+        video_thumbnail = self._search_regex(r'\'image\':\'(?P<thumbnail>[^\']+)\'',
+            webpage, u'thumbnail', fatal=False)
  
          return [{
              'id':       video_id,
              'url':      video_url,
              'ext':      video_extension,
              'title':    video_title,
-            'description': video_description,
+            # 'description': video_description,
              'upload_date': video_upload_date,
              'uploader_id': video_uploader_id,
              'thumbnail': video_thumbnail
@@ -4376,10 +4422,9 @@ class HypemIE(InfoExtractor):
          cookie = urlh.headers.get('Set-Cookie', '')
  
          self.report_extraction(track_id)
-        mobj = re.search(r'<script type="application/json" id="displayList-data">(.*?)</script>', response, flags=re.MULTILINE|re.DOTALL)
-        if mobj is None:
-            raise ExtractorError(u'Unable to extrack tracks')
-        html_tracks = mobj.group(1).strip()
+
+        html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>',
+            response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip()
          try:
              track_list = json.loads(html_tracks)
              track = track_list[u'tracks'][0]
@@ -4409,6 +4454,92 @@ class HypemIE(InfoExtractor):
              'artist':   artist,
          }]
  
+class Vbox7IE(InfoExtractor):
+    """Information Extractor for Vbox7"""
+    _VALID_URL = r'(?:http://)?(?:www\.)?vbox7\.com/play:([^/]+)'
+
+    def _real_extract(self, url):
+        """Extract the media URL, title and thumbnail of a Vbox7 video.
+
+        Returns a one-element list holding the info dictionary.
+        Raises ExtractorError when the URL does not match _VALID_URL or
+        when the info response cannot be split into the two expected values.
+        """
+        mobj = re.match(self._VALID_URL, url)
+        if mobj is None:
+            raise ExtractorError(u'Invalid URL: %s' % url)
+        video_id = mobj.group(1)
+
+        # The first page is expected to set window.location from JavaScript;
+        # that target is appended to the URL we actually landed on.
+        redirect_page, urlh = self._download_webpage_handle(url, video_id)
+        new_location = self._search_regex(r'window\.location = \'(.*)\';', redirect_page, u'redirect location')
+        redirect_url = urlh.geturl() + new_location
+        webpage = self._download_webpage(redirect_url, video_id, u'Downloading redirect page')
+
+        # Only the part of the <title> before the first '/' is kept.
+        title = self._html_search_regex(r'<title>(.*)</title>',
+            webpage, u'title').split('/')[0].strip()
+
+        ext = "flv"
+        info_url = "http://vbox7.com/play/magare.do"
+        # A POST body has to be bytes on Python 3; urlencode() output is
+        # pure ASCII, so encoding it is also harmless on Python 2.
+        data = compat_urllib_parse.urlencode({'as3':'1','vid':video_id}).encode('ascii')
+        info_request = compat_urllib_request.Request(info_url, data)
+        info_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
+        info_response = self._download_webpage(info_request, video_id, u'Downloading info webpage')
+        if info_response is None:
+            raise ExtractorError(u'Unable to extract the media url')
+        # The response is read as two '&'-separated key=value pairs: the media
+        # URL first, the thumbnail URL second. Split on the first '=' only so
+        # that a value containing '=' (e.g. a query string) is not truncated.
+        values = [pair.split('=', 1)[1] for pair in info_response.split('&') if '=' in pair]
+        if len(values) != 2:
+            raise ExtractorError(u'Unable to extract the media url')
+        final_url, thumbnail_url = values
+
+        return [{
+            'id':        video_id,
+            'url':       final_url,
+            'ext':       ext,
+            'title':     title,
+            'thumbnail': thumbnail_url,
+        }]
+
+class GametrailersIE(InfoExtractor):
+    """Information Extractor for GameTrailers"""
+    _VALID_URL = r'http://www.gametrailers.com/(?P<type>videos|reviews|full-episodes)/(?P<id>.*?)/(?P<title>.*)'
+
+    def _real_extract(self, url):
+        """Extract the video URL, title, description and thumbnail.
+
+        Returns the info dictionary for the video.
+        Raises ExtractorError when the URL does not match _VALID_URL, or
+        when the info feed or the media list cannot be parsed.
+        """
+        mobj = re.match(self._VALID_URL, url)
+        if mobj is None:
+            raise ExtractorError(u'Invalid URL: %s' % url)
+        video_id = mobj.group('id')
+        video_type = mobj.group('type')
+        webpage = self._download_webpage(url, video_id)
+        # Full episodes carry the mgid in a different attribute than
+        # regular videos and reviews.
+        if video_type == 'full-episodes':
+            mgid_re = r'data-video="(?P<mgid>mgid:.*?)"'
+        else:
+            mgid_re = r'data-contentId=\'(?P<mgid>mgid:.*?)\''
+        mgid = self._search_regex(mgid_re, webpage, u'mgid')
+        data = compat_urllib_parse.urlencode({'uri': mgid, 'acceptMethods': 'fms'})
+
+        # Two feeds are queried with the same parameters: one for the
+        # metadata, one for the list of media URLs.
+        info_page = self._download_webpage('http://www.gametrailers.com/feeds/mrss?' + data,
+                                           video_id, u'Downloading video info')
+        links_webpage = self._download_webpage('http://www.gametrailers.com/feeds/mediagen/?' + data,
+                                               video_id, u'Downloading video urls info')
+
+        self.report_extraction(video_id)
+        # Compiled with re.VERBOSE, so the line breaks and indentation
+        # inside the pattern are not part of the match.
+        info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]></title>.*
+                      <description><!\[CDATA\[(?P<description>.*?)\]\]></description>.*
+                      <image>.*
+                        <url>(?P<thumb>.*?)</url>.*
+                      </image>'''
+
+        m_info = re.search(info_re, info_page, re.VERBOSE|re.DOTALL)
+        if m_info is None:
+            raise ExtractorError(u'Unable to extract video info')
+        video_title = m_info.group('title')
+        video_description = m_info.group('description')
+        video_thumb = m_info.group('thumb')
+
+        m_urls = list(re.finditer(r'<src>(?P<url>.*)</src>', links_webpage))
+        if not m_urls:
+            raise ExtractorError(u'Unable to extract video url')
+        # They are sorted from worst to best quality
+        video_url = m_urls[-1].group('url')
+
+        return {'url':         video_url,
+                'id':          video_id,
+                'title':       video_title,
+                # Videos are actually flv not mp4
+                'ext':         'flv',
+                'thumbnail':   video_thumb,
+                'description': video_description,
+                }
  
  def gen_extractors():
      """ Return a list of an instance of every supported extractor.
@@ -4463,6 +4594,7 @@ def gen_extractors():
          SpiegelIE(),
          LiveLeakIE(),
          ARDIE(),
+        ZDFIE(),
          TumblrIE(),
          BandcampIE(),
          RedTubeIE(),
@@ -4473,6 +4605,8 @@ def gen_extractors():
          TeamcocoIE(),
          XHamsterIE(),
          HypemIE(),
+        Vbox7IE(),
+        GametrailersIE(),
          GenericIE()
      ]