[porncom] Fix extraction.
[youtube-dl] / youtube_dl / extractor / porncom.py
index 4baf79688ad73f0a6d0f51e05d1bb01ddc36ea39..5726cab3ae6763d7466d013898f71925966bed6d 100644 (file)
@@ -22,10 +22,12 @@ class PornComIE(InfoExtractor):
             'display_id': 'teen-grabs-a-dildo-and-fucks-her-pussy-live-on-1hottie-i-rec',
             'ext': 'mp4',
             'title': 'Teen grabs a dildo and fucks her pussy live on 1hottie, I rec',
-            'thumbnail': 're:^https?://.*\.jpg$',
+            'thumbnail': r're:^https?://.*\.jpg$',
             'duration': 551,
             'view_count': int,
             'age_limit': 18,
+            'categories': list,
+            'tags': list,
         },
     }, {
         'url': 'http://se.porn.com/videos/marsha-may-rides-seth-on-top-of-his-thick-cock-2658067',
@@ -41,7 +43,8 @@ class PornComIE(InfoExtractor):
 
         config = self._parse_json(
             self._search_regex(
-                r'=\s*({.+?})\s*,\s*[\da-zA-Z_]+\s*=',
+                (r'=\s*({.+?})\s*;\s*v1ar\b',
+                 r'=\s*({.+?})\s*,\s*[\da-zA-Z_]+\s*='),
                 webpage, 'config', default='{}'),
             display_id, transform_source=js_to_json, fatal=False)
 
@@ -67,7 +70,7 @@ class PornComIE(InfoExtractor):
                 'height': int(height),
                 'filesize_approx': parse_filesize(filesize),
             } for format_url, height, filesize in re.findall(
-                r'<a[^>]+href="(/download/[^"]+)">MPEG4 (\d+)p<span[^>]*>(\d+\s+[a-zA-Z]+)<',
+                r'<a[^>]+href="(/download/[^"]+)">[^<]*?(\d+)p<span[^>]*>(\d+\s*[a-zA-Z]+)<',
                 webpage)]
             thumbnail = None
             duration = None
@@ -75,7 +78,16 @@ class PornComIE(InfoExtractor):
         self._sort_formats(formats)
 
         view_count = str_to_int(self._search_regex(
-            r'class=["\']views["\'][^>]*><p>([\d,.]+)', webpage, 'view count'))
+            (r'Views:\s*</span>\s*<span>\s*([\d,.]+)',
+             r'class=["\']views["\'][^>]*><p>([\d,.]+)'), webpage,
+            'view count', fatal=False))
+
+        def extract_list(kind):
+            s = self._search_regex(
+                (r'(?s)%s:\s*</span>\s*<span>(.+?)</span>' % kind.capitalize(),
+                 r'(?s)<p[^>]*>%s:(.+?)</p>' % kind.capitalize()),
+                webpage, kind, fatal=False)
+            return re.findall(r'<a[^>]+>([^<]+)</a>', s or '')
 
         return {
             'id': video_id,
@@ -86,4 +98,6 @@ class PornComIE(InfoExtractor):
             'view_count': view_count,
             'formats': formats,
             'age_limit': 18,
+            'categories': extract_list('categories'),
+            'tags': extract_list('tags'),
         }