[toypics] Separate user and video extraction (#2601)
author Philipp Hagemeister <phihag@phihag.de>
Sat, 22 Mar 2014 14:15:01 +0000 (15:15 +0100)
committer Philipp Hagemeister <phihag@phihag.de>
Sat, 22 Mar 2014 14:15:01 +0000 (15:15 +0100)
test/test_playlists.py
youtube_dl/extractor/__init__.py
youtube_dl/extractor/toypics.py

index fbeed1c8cde5c02f100ca50cbe42e20203fe0155..4c9c34057dd535ac2cd4c4b0148917ab3c440237 100644 (file)
@@ -37,6 +37,7 @@ from youtube_dl.extractor import (
     GoogleSearchIE,
     GenericIE,
     TEDIE,
+    ToypicsUserIE,
 )
 
 
@@ -269,5 +270,13 @@ class TestPlaylists(unittest.TestCase):
         self.assertEqual(result['title'], 'Who are the hackers?')
         self.assertTrue(len(result['entries']) >= 6)
 
+    def test_toypics_user(self):
+        dl = FakeYDL()
+        ie = ToypicsUserIE(dl)
+        result = ie.extract('http://videos.toypics.net/Mikey')
+        self.assertIsPlaylist(result)
+        self.assertEqual(result['id'], 'Mikey')
+        self.assertTrue(len(result['entries']) >= 17)
+
 if __name__ == '__main__':
     unittest.main()
index 5ca6eb16cb370434dea85b3f57a6f3bf365cf686..b8c843515d48341862c446fa004b277f37d5f9db 100644 (file)
@@ -239,7 +239,7 @@ from .theplatform import ThePlatformIE
 from .thisav import ThisAVIE
 from .tinypic import TinyPicIE
 from .toutv import TouTvIE
-from .toypics import ToypicsIE
+from .toypics import ToypicsUserIE, ToypicsIE
 from .traileraddict import TrailerAddictIE
 from .trilulilu import TriluliluIE
 from .trutube import TruTubeIE
index 3cbfe2e7e6b0fc00016b06eb99f1f0d316ed11f1..33a6988cacdc36a0c18ad89db80d50f21cd74219 100644 (file)
@@ -2,43 +2,26 @@ from .common import InfoExtractor
 from math import ceil
 import re
 
+
 class ToypicsIE(InfoExtractor):
-    _VALID_URL = r'(?:http://)?videos\.toypics\.net/.*'
+    IE_DESC = 'Toypics user profile'
+    _VALID_URL = r'http://videos\.toypics\.net/view/(?P<id>[0-9]+)/.*'
     _TEST = {
         'url': 'http://videos.toypics.net/view/514/chancebulged,-2-1/',
-        #'md5': '8a8b546956bbd0e769dbe28f6e80abb3', == $head -c10K 12929646011616163504.mp4 |md5sum //no idea why it fails
+        'md5': '16e806ad6d6f58079d210fe30985e08b',
         'info_dict': {
             'id': '514',
             'ext': 'mp4',
             'title': 'Chance-Bulge\'d, 2',
-            'age_limit': 18
+            'age_limit': 18,
+            'uploader': 'kidsune',
         }
     }
-    PAGINATED=8
 
     def _real_extract(self, url):
-        mobj = re.match(r'(http://)?videos\.toypics\.net/(?P<username>[^/?]+)$', url)
-        if not mobj:
-            return self.extract_one(url)
-        return [self.extract_one(u) for u in self.process_paginated(url,
-            r'public/">Public Videos \((?P<videos_count>[0-9]+)\)</a></li>',
-            r'<p class="video-entry-title">\n\s*<a href="(http://videos.toypics.net/view/[^"]+)">'
-        )]
-
-    def process_paginated(self, profile_url, re_total, re_video_page):
-        profile_page = self._download_webpage(profile_url, 'profile' , 'getting profile page: '+profile_url)
-        videos_count = self._html_search_regex(re_total, profile_page, 'videos count')
-        lst = []
-        for n in xrange(1,int(ceil(float(videos_count)/self.PAGINATED)) +1):
-            lpage_url = profile_url +'/public/%d'%n
-            lpage = self._download_webpage(lpage_url, 'page %d'%n)
-            lst.extend(re.findall(re_video_page, lpage))
-        return lst
-
-    def extract_one(self,url):
-        mobj = re.match(r'(http://)?videos\.toypics\.net/view/(?P<videoid>[0-9]+)/.*', url)
-        video_id = mobj.group('videoid')
-        page = self._download_webpage(url, video_id, 'getting page: '+url)
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+        page = self._download_webpage(url, video_id)
         video_url = self._html_search_regex(
             r'src:\s+"(http://static[0-9]+\.toypics\.net/flvideo/[^"]+)"', page, 'video URL')
         title = self._html_search_regex(
@@ -48,8 +31,46 @@ class ToypicsIE(InfoExtractor):
         return {
             'id': video_id,
             'url': video_url,
-            'ext': video_url[-3:],
             'title': title,
             'uploader': username,
-            'age_limit': 18
+            'age_limit': 18,
+        }
+
+
+class ToypicsUserIE(InfoExtractor):
+    IE_DESC = 'Toypics user profile'
+    _VALID_URL = r'http://videos\.toypics\.net/(?P<username>[^/?]+)(?:$|[?#])'
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        username = mobj.group('username')
+
+        profile_page = self._download_webpage(
+            url, username, note='Retrieving profile page')
+
+        video_count = int(self._search_regex(
+            r'public/">Public Videos \(([0-9]+)\)</a></li>', profile_page,
+            'video count'))
+
+        PAGE_SIZE = 8
+        urls = []
+        page_count = (video_count + PAGE_SIZE + 1) // PAGE_SIZE
+        for n in range(1, page_count + 1):
+            lpage_url = url + '/public/%d' % n
+            lpage = self._download_webpage(
+                lpage_url, username,
+                note='Downloading page %d/%d' % (n, page_count))
+            urls.extend(
+                re.findall(
+                    r'<p class="video-entry-title">\n\s*<a href="(http://videos.toypics.net/view/[^"]+)">',
+                    lpage))
+
+        return {
+            '_type': 'playlist',
+            'id': username,
+            'entries': [{
+                '_type': 'url',
+                'url': url,
+                'ie_key': 'Toypics',
+            } for url in urls]
         }