Merge remote-tracking branch 'origin/reuse_ies'
[youtube-dl] / youtube_dl / extractor / dailymotion.py
index 0b47d652a678c992a0ed08a02becd75e0de7af6a..1ea449ca824bbf100edf9dc851a3cd74d1dcd266 100644 (file)
@@ -1,12 +1,14 @@
 import re
 import json
+import itertools
 
 from .common import InfoExtractor
 from ..utils import (
     compat_urllib_request,
+    get_element_by_attribute,
+    get_element_by_id,
 
     ExtractorError,
-    unescapeHTML,
 )
 
 class DailymotionIE(InfoExtractor):
@@ -19,7 +21,7 @@ class DailymotionIE(InfoExtractor):
         u'file': u'x33vw9.mp4',
         u'md5': u'392c4b85a60a90dc4792da41ce3144eb',
         u'info_dict': {
-            u"uploader": u"Alex and Van .", 
+            u"uploader": u"Amphora Alex and Van .", 
             u"title": u"Tutoriel de Youtubeur\"DL DES VIDEO DE YOUTUBE\""
         }
     }
@@ -40,13 +42,6 @@ class DailymotionIE(InfoExtractor):
         # Extract URL, uploader and title from webpage
         self.report_extraction(video_id)
 
-
-        mobj = re.search(r'<meta property="og:title" content="(?P<title>[^"]*)" />', webpage)
-        if mobj is None:
-            raise ExtractorError(u'Unable to extract title')
-        video_title = unescapeHTML(mobj.group('title'))
-
-        video_uploader = None
         video_uploader = self._search_regex([r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>',
                                              # Looking for official user
                                              r'<(?:span|a) .*?rel="author".*?>([^<]+?)</'],
@@ -81,6 +76,35 @@ class DailymotionIE(InfoExtractor):
             'url':      video_url,
             'uploader': video_uploader,
             'upload_date':  video_upload_date,
-            'title':    video_title,
+            'title':    self._og_search_title(webpage),
             'ext':      video_extension,
+            'thumbnail': info['thumbnail_url']
         }]
+
+
+class DailymotionPlaylistIE(InfoExtractor):
+    _VALID_URL = r'(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/playlist/(?P<id>.+?)/'
+    _MORE_PAGES_INDICATOR = r'<div class="next">.*?<a.*?href="/playlist/.+?".*?>.*?</a>.*?</div>'
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        playlist_id =  mobj.group('id')
+        video_ids = []
+
+        for pagenum in itertools.count(1):
+            webpage = self._download_webpage('https://www.dailymotion.com/playlist/%s/%s' % (playlist_id, pagenum),
+                                             playlist_id, u'Downloading page %s' % pagenum)
+
+            playlist_el = get_element_by_attribute(u'class', u'video_list', webpage)
+            video_ids.extend(re.findall(r'data-id="(.+?)" data-ext-id', playlist_el))
+
+            if re.search(self._MORE_PAGES_INDICATOR, webpage, re.DOTALL) is None:
+                break
+
+        entries = [self.url_result('http://www.dailymotion.com/video/%s' % video_id, 'Dailymotion')
+                   for video_id in video_ids]
+        return {'_type': 'playlist',
+                'id': playlist_id,
+                'title': get_element_by_id(u'playlist_name', webpage),
+                'entries': entries,
+                }