InfoExtractor: add some helper methods to extract OpenGraph info

author Jaime Marquínez Ferrándiz <jaime.marquinez.ferrandiz@gmail.com>

Fri, 12 Jul 2013 17:00:19 +0000 (19:00 +0200)

committer Jaime Marquínez Ferrándiz <jaime.marquinez.ferrandiz@gmail.com>

Fri, 12 Jul 2013 20:12:04 +0000 (22:12 +0200)
author Jaime Marquínez Ferrándiz <jaime.marquinez.ferrandiz@gmail.com>
Fri, 12 Jul 2013 17:00:19 +0000 (19:00 +0200)
committer Jaime Marquínez Ferrándiz <jaime.marquinez.ferrandiz@gmail.com>
Fri, 12 Jul 2013 20:12:04 +0000 (22:12 +0200)
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py

index 1bd5538ca2af7f2f8772b9e10eb9508588143258..0a0c4047d79314f497b13106ec105d2b525fb0f5 100644 (file)
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -257,6 +257,55 @@ class InfoExtractor(object):
          
          return (username, password)
  
+    # Helper functions for extracting OpenGraph info
+    @staticmethod
+    def _og_regex(property):
+        """Build the regex for the <meta> tag of an OpenGraph property.
+
+        property is the name without the 'og:' prefix (e.g. 'title').
+        The content value is captured by group 1 when it is double-quoted
+        and by group 2 when it is single-quoted.
+        """
+        # [^>]+? (rather than .+?) keeps the match inside one tag, so the
+        # content attribute of a later, unrelated tag is never picked up.
+        return r'<meta[^>]+?property=[\'"]og:%s[\'"][^>]+?content=(?:"(.+?)"|\'(.+?)\')' % property
+
+    def _og_search_property(self, property, html, name=None, **kargs):
+        """Search html for the og:<property> meta tag and return the result.
+
+        name is passed on to _html_search_regex and defaults to
+        'OpenGraph <property>'; extra keyword arguments are forwarded as is.
+        """
+        if name is None:
+            name = 'OpenGraph %s' % property
+        return self._html_search_regex(self._og_regex(property), html, name, **kargs)
+
+    def _og_search_thumbnail(self, html, **kargs):
+        """Search html for og:image; fatal defaults to False."""
+        # setdefault instead of a literal fatal=False: a caller that passes
+        # fatal itself would otherwise hit "multiple values" TypeError.
+        kargs.setdefault('fatal', False)
+        return self._og_search_property('image', html, 'thumbnail url', **kargs)
+
+    def _og_search_description(self, html, **kargs):
+        """Search html for og:description; fatal defaults to False."""
+        # setdefault so that an explicit fatal from the caller is honoured.
+        kargs.setdefault('fatal', False)
+        return self._og_search_property('description', html, **kargs)
+
+    def _og_search_title(self, html, **kargs):
+        """Search html for og:title."""
+        return self._og_search_property('title', html, **kargs)
+
+    def _og_search_video_url(self, html, name='video url', **kargs):
+        """Search html for the video url.
+
+        The og:video:secure_url pattern is listed ahead of the og:video one.
+        """
+        return self._html_search_regex([self._og_regex('video:secure_url'),
+                                        self._og_regex('video')],
+                                       html, name, **kargs)
+
  class SearchInfoExtractor(InfoExtractor):
      """
      Base class for paged search queries extractors.
diff --git a/youtube_dl/extractor/cspan.py b/youtube_dl/extractor/cspan.py

index a4853279bbfc0bce6517a2b1d032ea1eafe07482..7bf03c584c7388b162c9b3912a4aa0f410ed5b22 100644 (file)
--- a/youtube_dl/extractor/cspan.py
+++ b/youtube_dl/extractor/cspan.py
@@ -34,8 +34,6 @@ class CSpanIE(InfoExtractor):
          description = self._html_search_regex(r'<meta (?:property="og:|name=")description" content="(.*?)"',
                                                webpage, 'description',
                                                flags=re.MULTILINE|re.DOTALL)
-        thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.*?)"',
-                                            webpage, 'thumbnail')
  
          url = self._search_regex(r'<string name="URL">(.*?)</string>',
                                   video_info, 'video url')
@@ -49,5 +47,5 @@ class CSpanIE(InfoExtractor):
                  'url': url,
                  'play_path': path,
                  'description': description,
-                'thumbnail': thumbnail,
+                'thumbnail': self._og_search_thumbnail(webpage),
                  }
diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py

index 5fd2221a798403ff4832bf6992b8724bdf74f964..9bf7a28ca83248ac61d3cc64d98058568b98dde6 100644 (file)
--- a/youtube_dl/extractor/dailymotion.py
+++ b/youtube_dl/extractor/dailymotion.py
@@ -39,9 +39,6 @@ class DailymotionIE(InfoExtractor):
          # Extract URL, uploader and title from webpage
          self.report_extraction(video_id)
  
-        video_title = self._html_search_regex(r'<meta property="og:title" content="(.*?)" />',
-                                              webpage, 'title')
-
          video_uploader = self._search_regex([r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>',
                                               # Looking for official user
                                               r'<(?:span|a) .*?rel="author".*?>([^<]+?)</'],
@@ -76,7 +73,7 @@ class DailymotionIE(InfoExtractor):
              'url':      video_url,
              'uploader': video_uploader,
              'upload_date':  video_upload_date,
-            'title':    video_title,
+            'title':    self._og_search_title(webpage),
              'ext':      video_extension,
              'thumbnail': info['thumbnail_url']
          }]
diff --git a/youtube_dl/extractor/ehow.py b/youtube_dl/extractor/ehow.py

index 1f0b3888e071bd663125a6ba11045fd29602b9a9..2bb77aec6cb0d9ae2a7b4c6301c6deefc4548c57 100644 (file)
--- a/youtube_dl/extractor/ehow.py
+++ b/youtube_dl/extractor/ehow.py
@@ -28,14 +28,9 @@ class EHowIE(InfoExtractor):
          video_url = self._search_regex(r'(?:file|source)=(http[^\'"&]*)',
              webpage, u'video URL')
          final_url = compat_urllib_parse.unquote(video_url)        
-        thumbnail_url = self._search_regex(r'<meta property="og:image" content="(.+?)" />',
-            webpage, u'thumbnail URL')
          uploader = self._search_regex(r'<meta name="uploader" content="(.+?)" />',
              webpage, u'uploader')
-        title = self._search_regex(r'<meta property="og:title" content="(.+?)" />',
-            webpage, u'Video title').replace(' | eHow', '')
-        description = self._search_regex(r'<meta property="og:description" content="(.+?)" />',
-            webpage, u'video description')
+        title = self._og_search_title(webpage).replace(' | eHow', '')
          ext = determine_ext(final_url)
  
          return {
@@ -44,8 +39,8 @@ class EHowIE(InfoExtractor):
              'url':         final_url,
              'ext':         ext,
              'title':       title,
-            'thumbnail':   thumbnail_url,
-            'description': description,
+            'thumbnail':   self._og_search_thumbnail(webpage),
+            'description': self._og_search_description(webpage),
              'uploader':    uploader,
          }
  
diff --git a/youtube_dl/extractor/escapist.py b/youtube_dl/extractor/escapist.py

index 794460e8459b65130b117b0806e4ef1630160685..3aa2da52c0117bc9926df9c250eeb70da6cc2299 100644 (file)
--- a/youtube_dl/extractor/escapist.py
+++ b/youtube_dl/extractor/escapist.py
@@ -36,11 +36,7 @@ class EscapistIE(InfoExtractor):
          videoDesc = self._html_search_regex('<meta name="description" content="([^"]*)"',
              webpage, u'description', fatal=False)
  
-        imgUrl = self._html_search_regex('<meta property="og:image" content="([^"]*)"',
-            webpage, u'thumbnail', fatal=False)
-
-        playerUrl = self._html_search_regex('<meta property="og:video" content="([^"]*)"',
-            webpage, u'player url')
+        playerUrl = self._og_search_video_url(webpage, name='player url')
  
          title = self._html_search_regex('<meta name="title" content="([^"]*)"',
              webpage, u'player url').split(' : ')[-1]
@@ -70,7 +66,7 @@ class EscapistIE(InfoExtractor):
              'upload_date': None,
              'title': title,
              'ext': 'mp4',
-            'thumbnail': imgUrl,
+            'thumbnail': self._og_search_thumbnail(webpage),
              'description': videoDesc,
              'player_url': playerUrl,
          }
diff --git a/youtube_dl/extractor/flickr.py b/youtube_dl/extractor/flickr.py

index bd97bff9a78a9098ae6e5a6d8aa8612683405012..80d96baf739522b97f933878faa8a4083a0e8959 100644 (file)
--- a/youtube_dl/extractor/flickr.py
+++ b/youtube_dl/extractor/flickr.py
@@ -47,21 +47,12 @@ class FlickrIE(InfoExtractor):
              raise ExtractorError(u'Unable to extract video url')
          video_url = mobj.group(1) + unescapeHTML(mobj.group(2))
  
-        video_title = self._html_search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
-            webpage, u'video title')
-
-        video_description = self._html_search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
-            webpage, u'description', fatal=False)
-
-        thumbnail = self._html_search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
-            webpage, u'thumbnail', fatal=False)
-
          return [{
              'id':          video_id,
              'url':         video_url,
              'ext':         'mp4',
-            'title':       video_title,
-            'description': video_description,
-            'thumbnail':   thumbnail,
+            'title':       self._og_search_title(webpage),
+            'description': self._og_search_description(webpage),
+            'thumbnail':   self._og_search_thumbnail(webpage),
              'uploader_id': video_uploader_id,
          }]
diff --git a/youtube_dl/extractor/funnyordie.py b/youtube_dl/extractor/funnyordie.py

index 388aacf2f1b513c0797bc92d27a8217e49628f08..64363dcd51d687fe749ead55f5fc0efc4c04d31c 100644 (file)
--- a/youtube_dl/extractor/funnyordie.py
+++ b/youtube_dl/extractor/funnyordie.py
@@ -27,14 +27,11 @@ class FunnyOrDieIE(InfoExtractor):
          title = self._html_search_regex((r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>",
              r'<title>(?P<title>[^<]+?)</title>'), webpage, 'title', flags=re.DOTALL)
  
-        video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
-            webpage, u'description', fatal=False, flags=re.DOTALL)
-
          info = {
              'id': video_id,
              'url': video_url,
              'ext': 'mp4',
              'title': title,
-            'description': video_description,
+            'description': self._og_search_description(webpage, flags=re.DOTALL),
          }
          return [info]
diff --git a/youtube_dl/extractor/hotnewhiphop.py b/youtube_dl/extractor/hotnewhiphop.py

index ca3abb7d7fdaaf5c84869e1b4eda125d5076573a..ccca1d7e0bb41dae5694c2bd582728cc939b87da 100644 (file)
--- a/youtube_dl/extractor/hotnewhiphop.py
+++ b/youtube_dl/extractor/hotnewhiphop.py
@@ -33,16 +33,12 @@ class HotNewHipHopIE(InfoExtractor):
  
          video_title = self._html_search_regex(r"<title>(.*)</title>",
              webpage_src, u'title')
-        
-        # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
-        thumbnail = self._html_search_regex(r'"og:image" content="(.*)"',
-            webpage_src, u'thumbnail', fatal=False)
  
          results = [{
                      'id': video_id,
                      'url' : video_url,
                      'title' : video_title,
-                    'thumbnail' : thumbnail,
+                    'thumbnail' : self._og_search_thumbnail(webpage_src),
                      'ext' : 'mp3',
                      }]
-        return results
-\ No newline at end of file
+        return results
diff --git a/youtube_dl/extractor/instagram.py b/youtube_dl/extractor/instagram.py

index 6ae704efddce7a1b636cc9bd81b5244bbad95b2d..1ffadf67f120f60d57ec367635701fc709232a4b 100644 (file)
--- a/youtube_dl/extractor/instagram.py
+++ b/youtube_dl/extractor/instagram.py
@@ -18,12 +18,6 @@ class InstagramIE(InfoExtractor):
          mobj = re.match(self._VALID_URL, url)
          video_id = mobj.group(1)
          webpage = self._download_webpage(url, video_id)
-        video_url = self._html_search_regex(
-            r'<meta property="og:video" content="(.+?)"',
-            webpage, u'video URL')
-        thumbnail_url = self._html_search_regex(
-            r'<meta property="og:image" content="(.+?)" />',
-            webpage, u'thumbnail URL', fatal=False)
          html_title = self._html_search_regex(
              r'<title>(.+?)</title>',
              webpage, u'title', flags=re.DOTALL)
@@ -34,9 +28,9 @@ class InstagramIE(InfoExtractor):
  
          return [{
              'id':        video_id,
-            'url':       video_url,
+            'url':       self._og_search_video_url(webpage),
              'ext':       ext,
              'title':     title,
-            'thumbnail': thumbnail_url,
+            'thumbnail': self._og_search_thumbnail(webpage),
              'uploader_id' : uploader_id
          }]
diff --git a/youtube_dl/extractor/keek.py b/youtube_dl/extractor/keek.py

index 72ad6a3d00b25f30f8d56e06bf5a15da32b8a911..dda78743d79fd0716be997be15c09d12f0c5dc9b 100644 (file)
--- a/youtube_dl/extractor/keek.py
+++ b/youtube_dl/extractor/keek.py
@@ -24,8 +24,7 @@ class KeekIE(InfoExtractor):
          thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
          webpage = self._download_webpage(url, video_id)
  
-        video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
-            webpage, u'title')
+        video_title = self._og_search_title(webpage)
  
          uploader = self._html_search_regex(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
              webpage, u'uploader', fatal=False)
diff --git a/youtube_dl/extractor/liveleak.py b/youtube_dl/extractor/liveleak.py

index cf8a2c9312a53d8fe3f16b363cce31b2dd7c989d..dd062a14e736ba84b3aacb9d3bf426bca4c8f86f 100644 (file)
--- a/youtube_dl/extractor/liveleak.py
+++ b/youtube_dl/extractor/liveleak.py
@@ -33,11 +33,9 @@ class LiveLeakIE(InfoExtractor):
          video_url = self._search_regex(r'file: "(.*?)",',
              webpage, u'video URL')
  
-        video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
-            webpage, u'title').replace('LiveLeak.com -', '').strip()
+        video_title = self._og_search_title(webpage).replace('LiveLeak.com -', '').strip()
  
-        video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
-            webpage, u'description', fatal=False)
+        video_description = self._og_search_description(webpage)
  
          video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
              webpage, u'uploader', fatal=False)
diff --git a/youtube_dl/extractor/nba.py b/youtube_dl/extractor/nba.py

index 122b7dd2628e3b1cd43ffb9dbb67035047745c9f..0f178905bfe0b049499dd58f71df42da1c419639 100644 (file)
--- a/youtube_dl/extractor/nba.py
+++ b/youtube_dl/extractor/nba.py
@@ -30,8 +30,7 @@ class NBAIE(InfoExtractor):
          video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
  
          shortened_video_id = video_id.rpartition('/')[2]
-        title = self._html_search_regex(r'<meta property="og:title" content="(.*?)"',
-            webpage, 'title', default=shortened_video_id).replace('NBA.com: ', '')
+        title = self._og_search_title(webpage, default=shortened_video_id).replace('NBA.com: ', '')
  
          # It isn't there in the HTML it returns to us
          # uploader_date = self._html_search_regex(r'<b>Date:</b> (.*?)</div>', webpage, 'upload_date', fatal=False)
diff --git a/youtube_dl/extractor/statigram.py b/youtube_dl/extractor/statigram.py

index ae9a63e8b4e018c1cc3625aa8bc75fe37d62922a..b8e6b3bf91a05a793db631db9325bb2bc605b8b9 100644 (file)
--- a/youtube_dl/extractor/statigram.py
+++ b/youtube_dl/extractor/statigram.py
@@ -18,12 +18,6 @@ class StatigramIE(InfoExtractor):
          mobj = re.match(self._VALID_URL, url)
          video_id = mobj.group(1)
          webpage = self._download_webpage(url, video_id)
-        video_url = self._html_search_regex(
-            r'<meta property="og:video:secure_url" content="(.+?)">',
-            webpage, u'video URL')
-        thumbnail_url = self._html_search_regex(
-            r'<meta property="og:image" content="(.+?)" />',
-            webpage, u'thumbnail URL', fatal=False)
          html_title = self._html_search_regex(
              r'<title>(.+?)</title>',
              webpage, u'title')
@@ -34,9 +28,9 @@ class StatigramIE(InfoExtractor):
  
          return [{
              'id':        video_id,
-            'url':       video_url,
+            'url':       self._og_search_video_url(webpage),
              'ext':       ext,
              'title':     title,
-            'thumbnail': thumbnail_url,
+            'thumbnail': self._og_search_thumbnail(webpage),
              'uploader_id' : uploader_id
          }]
diff --git a/youtube_dl/extractor/teamcoco.py b/youtube_dl/extractor/teamcoco.py

index 1dd5e1b685e7aa99804d51d99a594945f30961a6..ec92e589a52c12cb3f88bc33861ade9752b779bd 100644 (file)
--- a/youtube_dl/extractor/teamcoco.py
+++ b/youtube_dl/extractor/teamcoco.py
@@ -30,15 +30,6 @@ class TeamcocoIE(InfoExtractor):
  
          self.report_extraction(video_id)
  
-        video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
-            webpage, u'title')
-
-        thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)"',
-            webpage, u'thumbnail', fatal=False)
-
-        video_description = self._html_search_regex(r'<meta property="og:description" content="(.*?)"',
-            webpage, u'description', fatal=False)
-
          data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
          data = self._download_webpage(data_url, video_id, 'Downloading data webpage')
  
@@ -49,7 +40,7 @@ class TeamcocoIE(InfoExtractor):
              'id':          video_id,
              'url':         video_url,
              'ext':         'mp4',
-            'title':       video_title,
-            'thumbnail':   thumbnail,
-            'description': video_description,
+            'title':       self._og_search_title(webpage),
+            'thumbnail':   self._og_search_thumbnail(webpage),
+            'description': self._og_search_description(webpage),
          }]
diff --git a/youtube_dl/extractor/traileraddict.py b/youtube_dl/extractor/traileraddict.py

index 9dd26c1637f58e5a823a6cf75ba8fd4ee42f0750..324bb6231e635cc90c7f9161e137eec5fb491a33 100644 (file)
--- a/youtube_dl/extractor/traileraddict.py
+++ b/youtube_dl/extractor/traileraddict.py
@@ -24,11 +24,8 @@ class TrailerAddictIE(InfoExtractor):
                  webpage, 'video title').replace(' - Trailer Addict','')
          view_count = self._search_regex(r'Views: (.+?)<br />',
                  webpage, 'Views Count')
-        description = self._search_regex(r'<meta property="og:description" content="(.+?)" />',
-                webpage, 'video description')
-        video_id = self._search_regex(r'<meta property="og:video" content="(.+?)" />',
-                webpage, 'Video id').split('=')[1]
-        
+        video_id = self._og_search_property('video', webpage, 'Video id').split('=')[1]
+
          info_url = "http://www.traileraddict.com/fvar.php?tid=%s" %(str(video_id))
          info_webpage = self._download_webpage(info_url, video_id , "Downloading the info webpage")
          
@@ -44,6 +41,6 @@ class TrailerAddictIE(InfoExtractor):
              'ext'         : ext,
              'title'       : title,
              'thumbnail'   : thumbnail_url,
-            'description' : description,
+            'description' : self._og_search_description(webpage),
              'view_count'  : view_count,
          }]
diff --git a/youtube_dl/extractor/tutv.py b/youtube_dl/extractor/tutv.py

index fcaa6ac01af6d778e43aa7b35d92d3dcc9478911..4e404fbf5912fd32b695c701466309a38179e799 100644 (file)
--- a/youtube_dl/extractor/tutv.py
+++ b/youtube_dl/extractor/tutv.py
@@ -22,8 +22,6 @@ class TutvIE(InfoExtractor):
          video_id = mobj.group('id')
  
          webpage = self._download_webpage(url, video_id)
-        title = self._html_search_regex(
-            r'<meta property="og:title" content="(.*?)">', webpage, u'title')
          internal_id = self._search_regex(r'codVideo=([0-9]+)', webpage, u'internal video ID')
  
          data_url = u'http://tu.tv/flvurl.php?codVideo=' + str(internal_id)
@@ -36,6 +34,6 @@ class TutvIE(InfoExtractor):
              'id': internal_id,
              'url': video_url,
              'ext': ext,
-            'title': title,
+            'title': self._og_search_title(webpage),
          }
          return [info]
diff --git a/youtube_dl/extractor/vine.py b/youtube_dl/extractor/vine.py

index bdd3522ebf5a7385a80c54a6e85c808d54346cc4..c4ec1f06ffe3ccce17598aeb319047f0890f9a02 100644 (file)
--- a/youtube_dl/extractor/vine.py
+++ b/youtube_dl/extractor/vine.py
@@ -27,12 +27,6 @@ class VineIE(InfoExtractor):
          video_url = self._html_search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
              webpage, u'video URL')
  
-        video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
-            webpage, u'title')
-
-        thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"',
-            webpage, u'thumbnail', fatal=False)
-
          uploader = self._html_search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
              webpage, u'uploader', fatal=False, flags=re.DOTALL)
  
@@ -40,7 +34,7 @@ class VineIE(InfoExtractor):
              'id':        video_id,
              'url':       video_url,
              'ext':       'mp4',
-            'title':     video_title,
-            'thumbnail': thumbnail,
+            'title':     self._og_search_title(webpage),
+            'thumbnail': self._og_search_thumbnail(webpage),
              'uploader':  uploader,
          }]
author	Jaime Marquínez Ferrándiz <jaime.marquinez.ferrandiz@gmail.com>
	Fri, 12 Jul 2013 17:00:19 +0000 (19:00 +0200)
committer	Jaime Marquínez Ferrándiz <jaime.marquinez.ferrandiz@gmail.com>
	Fri, 12 Jul 2013 20:12:04 +0000 (22:12 +0200)
youtube_dl/extractor/common.py		patch \| blob \| history
youtube_dl/extractor/cspan.py		patch \| blob \| history
youtube_dl/extractor/dailymotion.py		patch \| blob \| history
youtube_dl/extractor/ehow.py		patch \| blob \| history
youtube_dl/extractor/escapist.py		patch \| blob \| history
youtube_dl/extractor/flickr.py		patch \| blob \| history
youtube_dl/extractor/funnyordie.py		patch \| blob \| history
youtube_dl/extractor/hotnewhiphop.py		patch \| blob \| history
youtube_dl/extractor/instagram.py		patch \| blob \| history
youtube_dl/extractor/keek.py		patch \| blob \| history
youtube_dl/extractor/liveleak.py		patch \| blob \| history
youtube_dl/extractor/nba.py		patch \| blob \| history
youtube_dl/extractor/statigram.py		patch \| blob \| history
youtube_dl/extractor/teamcoco.py		patch \| blob \| history
youtube_dl/extractor/traileraddict.py		patch \| blob \| history
youtube_dl/extractor/tutv.py		patch \| blob \| history
youtube_dl/extractor/vine.py		patch \| blob \| history