Merge branch 'master' of https://github.com/DarkstaIkers/youtube-dl into DarkstaIkers...
[youtube-dl] / youtube_dl / extractor / youjizz.py
index d9efac76eb7707d622fc3c4e499e380521cd40fb..b50f34e9bb30e47c679940ca1577ea8cc6683934 100644 (file)
@@ -1,45 +1,39 @@
-import re
+from __future__ import unicode_literals
 
 from .common import InfoExtractor
-from ..utils import (
-    ExtractorError,
-)
 
 
 class YouJizzIE(InfoExtractor):
-    _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$'
+    _VALID_URL = r'https?://(?:\w+\.)?youjizz\.com/videos/(?:[^/#?]+)?-(?P<id>[0-9]+)\.html(?:$|[?#])'
+    _TESTS = [{
+        'url': 'http://www.youjizz.com/videos/zeichentrick-1-2189178.html',
+        'md5': '78fc1901148284c69af12640e01c6310',
+        'info_dict': {
+            'id': '2189178',
+            'ext': 'mp4',
+            'title': 'Zeichentrick 1',
+            'age_limit': 18,
+        }
+    }, {
+        'url': 'http://www.youjizz.com/videos/-2189178.html',
+        'only_matching': True,
+    }]
 
     def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-
-        video_id = mobj.group('videoid')
-
-        # Get webpage content
+        video_id = self._match_id(url)
         webpage = self._download_webpage(url, video_id)
+        # YouJizz's HTML5 player has invalid HTML
+        webpage = webpage.replace('"controls', '" controls')
+        age_limit = self._rta_search(webpage)
+        video_title = self._html_search_regex(
+            r'<title>\s*(.*)\s*</title>', webpage, 'title')
 
-        # Get the video title
-        video_title = self._html_search_regex(r'<title>(?P<title>.*)</title>',
-            webpage, u'title').strip()
-
-        # Get the embed page
-        result = re.search(r'https?://www.youjizz.com/videos/embed/(?P<videoid>[0-9]+)', webpage)
-        if result is None:
-            raise ExtractorError(u'ERROR: unable to extract embed page')
-
-        embed_page_url = result.group(0).strip()
-        video_id = result.group('videoid')
-
-        webpage = self._download_webpage(embed_page_url, video_id)
-
-        # Get the video URL
-        video_url = self._search_regex(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
-            webpage, u'video URL')
+        info_dict = self._parse_html5_media_entries(url, webpage, video_id)[0]
 
-        info = {'id': video_id,
-                'url': video_url,
-                'title': video_title,
-                'ext': 'flv',
-                'format': 'flv',
-                'player_url': embed_page_url}
+        info_dict.update({
+            'id': video_id,
+            'title': video_title,
+            'age_limit': age_limit,
+        })
 
-        return [info]
+        return info_dict