Move G+ IE into its own file, and move google search into a more descriptive module
authorPhilipp Hagemeister <phihag@phihag.de>
Sun, 23 Jun 2013 18:55:15 +0000 (20:55 +0200)
committerPhilipp Hagemeister <phihag@phihag.de>
Sun, 23 Jun 2013 18:55:15 +0000 (20:55 +0200)
youtube_dl/InfoExtractors.py
youtube_dl/extractor/google.py [deleted file]
youtube_dl/extractor/googleplus.py [new file with mode: 0644]
youtube_dl/extractor/googlesearch.py [new file with mode: 0644]

index 7c253bf9fac3b3373408b809b7e4a939fa2ed4c7..ca1e43404122ff05c750830b6667e686a7fc65c8 100755 (executable)
@@ -25,7 +25,8 @@ from .extractor.comedycentral import ComedyCentralIE
 from .extractor.dailymotion import DailymotionIE
 from .extractor.gametrailers import GametrailersIE
 from .extractor.generic import GenericIE
-from .extractor.google import GoogleSearchIE
+from .extractor.googleplus import GooglePlusIE
+from .extractor.googlesearch import GoogleSearchIE
 from .extractor.metacafe import MetacafeIE
 from .extractor.myvideo import MyVideoIE
 from .extractor.statigram import StatigramIE
@@ -926,79 +927,6 @@ class XNXXIE(InfoExtractor):
         }]
 
 
-class GooglePlusIE(InfoExtractor):
-    """Information extractor for plus.google.com."""
-
-    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
-    IE_NAME = u'plus.google'
-
-    def _real_extract(self, url):
-        # Extract id from URL
-        mobj = re.match(self._VALID_URL, url)
-        if mobj is None:
-            raise ExtractorError(u'Invalid URL: %s' % url)
-
-        post_url = mobj.group(0)
-        video_id = mobj.group(1)
-
-        video_extension = 'flv'
-
-        # Step 1, Retrieve post webpage to extract further information
-        webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage')
-
-        self.report_extraction(video_id)
-
-        # Extract update date
-        upload_date = self._html_search_regex('title="Timestamp">(.*?)</a>',
-            webpage, u'upload date', fatal=False)
-        if upload_date:
-            # Convert timestring to a format suitable for filename
-            upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
-            upload_date = upload_date.strftime('%Y%m%d')
-
-        # Extract uploader
-        uploader = self._html_search_regex(r'rel\="author".*?>(.*?)</a>',
-            webpage, u'uploader', fatal=False)
-
-        # Extract title
-        # Get the first line for title
-        video_title = self._html_search_regex(r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]',
-            webpage, 'title', default=u'NA')
-
-        # Step 2, Stimulate clicking the image box to launch video
-        video_page = self._search_regex('"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]',
-            webpage, u'video page URL')
-        webpage = self._download_webpage(video_page, video_id, u'Downloading video page')
-
-        # Extract video links on video page
-        """Extract video links of all sizes"""
-        pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
-        mobj = re.findall(pattern, webpage)
-        if len(mobj) == 0:
-            raise ExtractorError(u'Unable to extract video links')
-
-        # Sort in resolution
-        links = sorted(mobj)
-
-        # Choose the lowest of the sort, i.e. highest resolution
-        video_url = links[-1]
-        # Only get the url. The resolution part in the tuple has no use anymore
-        video_url = video_url[-1]
-        # Treat escaped \u0026 style hex
-        try:
-            video_url = video_url.decode("unicode_escape")
-        except AttributeError: # Python 3
-            video_url = bytes(video_url, 'ascii').decode('unicode-escape')
-
-
-        return [{
-            'id':       video_id,
-            'url':      video_url,
-            'uploader': uploader,
-            'upload_date':  upload_date,
-            'title':    video_title,
-            'ext':      video_extension,
-        }]
 
 class NBAIE(InfoExtractor):
     _VALID_URL = r'^(?:https?://)?(?:watch\.|www\.)?nba\.com/(?:nba/)?video(/[^?]*?)(?:/index\.html)?(?:\?.*)?$'
diff --git a/youtube_dl/extractor/google.py b/youtube_dl/extractor/google.py
deleted file mode 100644 (file)
index 21c240e..0000000
+++ /dev/null
@@ -1,39 +0,0 @@
-import itertools
-import re
-
-from .common import SearchInfoExtractor
-from ..utils import (
-    compat_urllib_parse,
-)
-
-
-class GoogleSearchIE(SearchInfoExtractor):
-    """Information Extractor for Google Video search queries."""
-    _MORE_PAGES_INDICATOR = r'id="pnnext" class="pn"'
-    _MAX_RESULTS = 1000
-    IE_NAME = u'video.google:search'
-    _SEARCH_KEY = 'gvsearch'
-
-    def _get_n_results(self, query, n):
-        """Get a specified number of results for a query"""
-
-        res = {
-            '_type': 'playlist',
-            'id': query,
-            'entries': []
-        }
-
-        for pagenum in itertools.count(1):
-            result_url = u'http://www.google.com/search?tbm=vid&q=%s&start=%s&hl=en' % (compat_urllib_parse.quote_plus(query), pagenum*10)
-            webpage = self._download_webpage(result_url, u'gvsearch:' + query,
-                                             note='Downloading result page ' + str(pagenum))
-
-            for mobj in re.finditer(r'<h3 class="r"><a href="([^"]+)"', webpage):
-                e = {
-                    '_type': 'url',
-                    'url': mobj.group(1)
-                }
-                res['entries'].append(e)
-
-            if (pagenum * 10 > n) or not re.search(self._MORE_PAGES_INDICATOR, webpage):
-                return res
diff --git a/youtube_dl/extractor/googleplus.py b/youtube_dl/extractor/googleplus.py
new file mode 100644 (file)
index 0000000..e922bd1
--- /dev/null
@@ -0,0 +1,82 @@
+import datetime
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    ExtractorError,
+)
+
+
+class GooglePlusIE(InfoExtractor):
+    """Information extractor for plus.google.com."""
+
+    _VALID_URL = r'(?:https://)?plus\.google\.com/(?:[^/]+/)*?posts/(\w+)'
+    IE_NAME = u'plus.google'
+
+    def _real_extract(self, url):
+        # Extract id from URL
+        mobj = re.match(self._VALID_URL, url)
+        if mobj is None:
+            raise ExtractorError(u'Invalid URL: %s' % url)
+
+        post_url = mobj.group(0)
+        video_id = mobj.group(1)
+
+        video_extension = 'flv'
+
+        # Step 1, Retrieve post webpage to extract further information
+        webpage = self._download_webpage(post_url, video_id, u'Downloading entry webpage')
+
+        self.report_extraction(video_id)
+
+        # Extract upload date
+        upload_date = self._html_search_regex('title="Timestamp">(.*?)</a>',
+            webpage, u'upload date', fatal=False)
+        if upload_date:
+            # Convert timestring to a format suitable for filename
+            upload_date = datetime.datetime.strptime(upload_date, "%Y-%m-%d")
+            upload_date = upload_date.strftime('%Y%m%d')
+
+        # Extract uploader
+        uploader = self._html_search_regex(r'rel\="author".*?>(.*?)</a>',
+            webpage, u'uploader', fatal=False)
+
+        # Extract title
+        # Get the first line for title
+        video_title = self._html_search_regex(r'<meta name\=\"Description\" content\=\"(.*?)[\n<"]',
+            webpage, 'title', default=u'NA')
+
+        # Step 2, Simulate clicking the image box to launch video
+        video_page = self._search_regex('"(https\://plus\.google\.com/photos/.*?)",,"image/jpeg","video"\]',
+            webpage, u'video page URL')
+        webpage = self._download_webpage(video_page, video_id, u'Downloading video page')
+
+        # Extract video links on video page
+        """Extract video links of all sizes"""
+        pattern = '\d+,\d+,(\d+),"(http\://redirector\.googlevideo\.com.*?)"'
+        mobj = re.findall(pattern, webpage)
+        if len(mobj) == 0:
+            raise ExtractorError(u'Unable to extract video links')
+
+        # Sort by resolution (NOTE: the captured values are strings, so the order is lexicographic)
+        links = sorted(mobj)
+
+        # Choose the last entry of the sorted list (intended to be the highest resolution)
+        video_url = links[-1]
+        # Only get the url. The resolution part in the tuple has no use anymore
+        video_url = video_url[-1]
+        # Treat escaped \u0026 style hex
+        try:
+            video_url = video_url.decode("unicode_escape")
+        except AttributeError: # Python 3
+            video_url = bytes(video_url, 'ascii').decode('unicode-escape')
+
+
+        return [{
+            'id':       video_id,
+            'url':      video_url,
+            'uploader': uploader,
+            'upload_date':  upload_date,
+            'title':    video_title,
+            'ext':      video_extension,
+        }]
diff --git a/youtube_dl/extractor/googlesearch.py b/youtube_dl/extractor/googlesearch.py
new file mode 100644 (file)
index 0000000..21c240e
--- /dev/null
@@ -0,0 +1,39 @@
+import itertools
+import re
+
+from .common import SearchInfoExtractor
+from ..utils import (
+    compat_urllib_parse,
+)
+
+
+class GoogleSearchIE(SearchInfoExtractor):
+    """Information Extractor for Google Video search queries."""
+    _MORE_PAGES_INDICATOR = r'id="pnnext" class="pn"'
+    _MAX_RESULTS = 1000
+    IE_NAME = u'video.google:search'
+    _SEARCH_KEY = 'gvsearch'
+
+    def _get_n_results(self, query, n):
+        """Get a specified number of results for a query"""
+
+        res = {
+            '_type': 'playlist',
+            'id': query,
+            'entries': []
+        }
+
+        for pagenum in itertools.count(1):
+            result_url = u'http://www.google.com/search?tbm=vid&q=%s&start=%s&hl=en' % (compat_urllib_parse.quote_plus(query), pagenum*10)
+            webpage = self._download_webpage(result_url, u'gvsearch:' + query,
+                                             note='Downloading result page ' + str(pagenum))
+
+            for mobj in re.finditer(r'<h3 class="r"><a href="([^"]+)"', webpage):
+                e = {
+                    '_type': 'url',
+                    'url': mobj.group(1)
+                }
+                res['entries'].append(e)
+
+            if (pagenum * 10 > n) or not re.search(self._MORE_PAGES_INDICATOR, webpage):
+                return res