[dailymotion] Added support for subtitles + new InfoExtractor for
author: Ismael Mejia <iemejia@gmail.com>
Wed, 7 Aug 2013 16:59:11 +0000 (18:59 +0200)
committer: Ismael Mejia <iemejia@gmail.com>
Wed, 7 Aug 2013 16:59:11 +0000 (18:59 +0200)
generic subtitle download.

The idea is that all subtitle downloaders must descend from SubtitlesIE
and implement only three basic methods to get the complete subtitle
download functionality. This will allow the code in YoutubeIE to be
reduced once it is rewritten.

test/test_dailymotion_subtitles.py [new file with mode: 0644]
youtube_dl/__init__.py
youtube_dl/extractor/dailymotion.py
youtube_dl/extractor/subtitles.py [new file with mode: 0644]

diff --git a/test/test_dailymotion_subtitles.py b/test/test_dailymotion_subtitles.py
new file mode 100644 (file)
index 0000000..f63426a
--- /dev/null
@@ -0,0 +1,96 @@
+#!/usr/bin/env python
+
+import sys
+import unittest
+import json
+import io
+import hashlib
+
+# Allow direct execution
+import os
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from youtube_dl.extractor import DailymotionIE
+from youtube_dl.utils import *
+from helper import FakeYDL
+
+def md5(s):
+    """Return the hexadecimal MD5 digest of the text *s*, encoded as UTF-8."""
+    return hashlib.md5(s.encode('utf-8')).hexdigest()
+
+
+TEST_URL = 'http://www.dailymotion.com/video/xczg00'
+
+
+class TestDailymotionSubtitles(unittest.TestCase):
+    """Subtitle extraction tests for DailymotionIE, run against TEST_URL."""
+
+    def setUp(self):
+        # Keep the downloader on the instance so that the defaults below are
+        # the starting point of every test; each test then overrides only the
+        # options it is about.
+        self.DL = FakeYDL()
+        self.DL.params['allsubtitles'] = False
+        self.DL.params['writesubtitles'] = False
+        self.DL.params['subtitlesformat'] = 'srt'
+        self.DL.params['listsubtitles'] = False
+
+    def _extract_info(self):
+        """Run DailymotionIE on TEST_URL with the current self.DL params."""
+        return DailymotionIE(self.DL).extract(TEST_URL)
+
+    def test_no_subtitles(self):
+        self.DL.params['writesubtitles'] = False
+        subtitles = self._extract_info()[0]['subtitles']
+        self.assertEqual(subtitles, None)
+
+    def test_subtitles(self):
+        self.DL.params['writesubtitles'] = True
+        sub = self._extract_info()[0]['subtitles']['en']
+        self.assertEqual(md5(sub), '976553874490cba125086bbfea3ff76f')
+
+    def test_subtitles_fr(self):
+        self.DL.params['writesubtitles'] = True
+        self.DL.params['subtitleslang'] = 'fr'
+        sub = self._extract_info()[0]['subtitles']['fr']
+        self.assertEqual(md5(sub), '594564ec7d588942e384e920e5341792')
+
+    def test_onlysubtitles(self):
+        self.DL.params['writesubtitles'] = True
+        self.DL.params['onlysubtitles'] = True
+        sub = self._extract_info()[0]['subtitles']['en']
+        self.assertEqual(md5(sub), '976553874490cba125086bbfea3ff76f')
+
+    def test_allsubtitles(self):
+        self.DL.params['allsubtitles'] = True
+        subtitles = self._extract_info()[0]['subtitles']
+        self.assertEqual(len(subtitles.keys()), 5)
+
+    # Disabled: the sbv and vtt cases below are not run for this extractor.
+    # def test_subtitles_sbv_format(self):
+    #     DL = FakeYDL()
+    #     DL.params['writesubtitles'] = True
+    #     DL.params['subtitlesformat'] = 'sbv'
+    #     IE = DailymotionIE(DL)
+    #     info_dict = IE.extract(TEST_URL)
+    #     sub = info_dict[0]['subtitles'][0]
+    #     self.assertEqual(md5(sub), '13aeaa0c245a8bed9a451cb643e3ad8b')
+    # def test_subtitles_vtt_format(self):
+    #     DL = FakeYDL()
+    #     DL.params['writesubtitles'] = True
+    #     DL.params['subtitlesformat'] = 'vtt'
+    #     IE = DailymotionIE(DL)
+    #     info_dict = IE.extract(TEST_URL)
+    #     sub = info_dict[0]['subtitles'][0]
+    #     self.assertEqual(md5(sub), '356cdc577fde0c6783b9b822e7206ff7')
+
+    def test_list_subtitles(self):
+        # With 'listsubtitles' set, the extractor is expected to return nothing.
+        self.DL.params['listsubtitles'] = True
+        info_dict = self._extract_info()
+        self.assertEqual(info_dict, None)
+
+    def test_automatic_captions(self):
+        self.DL.params['writeautomaticsub'] = True
+        self.DL.params['subtitleslang'] = 'en'
+        sub = self._extract_info()[0]['subtitles']
+        self.assertEqual(len(sub), 0)
+
+
+if __name__ == '__main__':
+    unittest.main()
index eb23c53a570fa84eabd64ec7f83a2fc045a8727c..c4d595e1c1643a581a46ee8e546104075df759db 100644 (file)
@@ -187,22 +187,22 @@ def parseOpts(overrideArguments=None):
             action='store_true', dest='listformats', help='list all available formats (currently youtube only)')
     video_format.add_option('--write-sub', '--write-srt',
             action='store_true', dest='writesubtitles',
-            help='write subtitle file (currently youtube only)', default=False)
+            help='write subtitle file', default=False)
     video_format.add_option('--write-auto-sub', '--write-automatic-sub',
             action='store_true', dest='writeautomaticsub',
-            help='write automatic subtitle file (currently youtube only)', default=False)
+            help='write automatic subtitle file (youtube only)', default=False)
     video_format.add_option('--only-sub',
             action='store_true', dest='skip_download',
             help='[deprecated] alias of --skip-download', default=False)
     video_format.add_option('--all-subs',
             action='store_true', dest='allsubtitles',
-            help='downloads all the available subtitles of the video (currently youtube only)', default=False)
+            help='downloads all the available subtitles of the video', default=False)
     video_format.add_option('--list-subs',
             action='store_true', dest='listsubtitles',
-            help='lists all available subtitles for the video (currently youtube only)', default=False)
+            help='lists all available subtitles for the video', default=False)
     video_format.add_option('--sub-format',
             action='store', dest='subtitlesformat', metavar='FORMAT',
-            help='subtitle format [srt/sbv/vtt] (default=srt) (currently youtube only)', default='srt')
+            help='subtitle format (default=srt) ([sbv/vtt] youtube only)', default='srt')
     video_format.add_option('--sub-lang', '--srt-lang',
             action='store', dest='subtitleslang', metavar='LANG',
             help='language of the subtitles to download (optional) use IETF language tags like \'en\'')
index 9bf7a28ca83248ac61d3cc64d98058568b98dde6..eb2322d54778673bf4d7e718d0393c80b2217e8c 100644 (file)
@@ -1,14 +1,49 @@
 import re
 import json
+import itertools
+import socket
 
 from .common import InfoExtractor
+from .subtitles import SubtitlesIE
+
 from ..utils import (
+    compat_http_client,
+    compat_urllib_error,
     compat_urllib_request,
+    compat_str,
+    get_element_by_attribute,
+    get_element_by_id,
 
     ExtractorError,
 )
 
-class DailymotionIE(InfoExtractor):
+
+class DailyMotionSubtitlesIE(SubtitlesIE):
+    """Dailymotion implementation of the three SubtitlesIE hooks."""
+
+    # (video_id, {language: url}) of the most recent successful listing.
+    # _get_subtitle_url reuses it so that downloading several languages of one
+    # video does not query the subtitles API again for each language.
+    _last_subtitles_lookup = (None, {})
+
+    def _get_available_subtitles(self, video_id):
+        """Return {language: url} for video_id.
+
+        Returns {} (after reporting a warning) when the listing cannot be
+        downloaded or the video has no subtitles.
+        """
+        request = compat_urllib_request.Request('https://api.dailymotion.com/video/%s/subtitles?fields=id,language,url' % video_id)
+        try:
+            response = compat_urllib_request.urlopen(request)
+            try:
+                sub_list = response.read().decode('utf-8')
+            finally:
+                # release the connection instead of waiting for garbage collection
+                response.close()
+        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
+            self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err))
+            return {}
+        info = json.loads(sub_list)
+        if info['total'] > 0:
+            sub_lang_list = dict((entry['language'], entry['url']) for entry in info['list'])
+            # only successful, non-empty listings are remembered
+            self._last_subtitles_lookup = (video_id, sub_lang_list)
+            return sub_lang_list
+        self._downloader.report_warning(u'video doesn\'t have subtitles')
+        return {}
+
+    def _get_subtitle_url(self, sub_lang, sub_name, video_id, format):
+        """Return the subtitle url of video_id for the language sub_lang.
+
+        sub_name and format are not used. Raises KeyError if sub_lang is not
+        among the available languages.
+        """
+        cached_id, sub_lang_list = self._last_subtitles_lookup
+        if cached_id != video_id:
+            sub_lang_list = self._get_available_subtitles(video_id)
+        return sub_lang_list[sub_lang]
+
+    def _request_automatic_caption(self, video_id, webpage):
+        """Report that automatic captions are unsupported and return {}."""
+        self._downloader.report_warning(u'Automatic Captions not supported by dailymotion')
+        return {}
+
+
+class DailymotionIE(DailyMotionSubtitlesIE): #,InfoExtractor):
     """Information Extractor for Dailymotion"""
 
     _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)'
@@ -18,7 +53,7 @@ class DailymotionIE(InfoExtractor):
         u'file': u'x33vw9.mp4',
         u'md5': u'392c4b85a60a90dc4792da41ce3144eb',
         u'info_dict': {
-            u"uploader": u"Alex and Van .", 
+            u"uploader": u"Alex and Van .",
             u"title": u"Tutoriel de Youtubeur\"DL DES VIDEO DE YOUTUBE\""
         }
     }
@@ -57,17 +92,36 @@ class DailymotionIE(InfoExtractor):
 
         # TODO: support choosing qualities
 
-        for key in ['stream_h264_hd1080_url','stream_h264_hd_url',
-                    'stream_h264_hq_url','stream_h264_url',
+        for key in ['stream_h264_hd1080_url', 'stream_h264_hd_url',
+                    'stream_h264_hq_url', 'stream_h264_url',
                     'stream_h264_ld_url']:
-            if info.get(key):#key in info and info[key]:
+            if info.get(key):  # key in info and info[key]:
                 max_quality = key
-                self.to_screen(u'Using %s' % key)
+                self.to_screen(u'%s: Using %s' % (video_id, key))
                 break
         else:
             raise ExtractorError(u'Unable to extract video URL')
         video_url = info[max_quality]
 
+        # subtitles
+        video_subtitles = None
+        video_webpage = None
+
+        if self._downloader.params.get('writesubtitles', False) or self._downloader.params.get('allsubtitles', False):
+            video_subtitles = self._extract_subtitles(video_id)
+        elif self._downloader.params.get('writeautomaticsub', False):
+            video_subtitles = self._request_automatic_caption(video_id, video_webpage)
+
+        if self._downloader.params.get('listsubtitles', False):
+            self._list_available_subtitles(video_id)
+            return
+
+        if 'length_seconds' not in info:
+            self._downloader.report_warning(u'unable to extract video duration')
+            video_duration = ''
+        else:
+            video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
+
         return [{
             'id':       video_id,
             'url':      video_url,
@@ -75,5 +129,6 @@ class DailymotionIE(InfoExtractor):
             'upload_date':  video_upload_date,
             'title':    self._og_search_title(webpage),
             'ext':      video_extension,
+            'subtitles':    video_subtitles,
             'thumbnail': info['thumbnail_url']
         }]
diff --git a/youtube_dl/extractor/subtitles.py b/youtube_dl/extractor/subtitles.py
new file mode 100644 (file)
index 0000000..89864e5
--- /dev/null
@@ -0,0 +1,80 @@
+import socket
+
+from .common import InfoExtractor
+
+from ..utils import (
+    compat_http_client,
+    compat_urllib_error,
+    compat_urllib_request,
+    compat_str,
+)
+
+
+class SubtitlesIE(InfoExtractor):
+    """Base class for extractors that can download subtitles.
+
+    Subclasses redefine _get_available_subtitles, _get_subtitle_url and
+    _request_automatic_caption; listing, language selection and download
+    are implemented here on top of those hooks.
+    """
+
+    def report_video_subtitles_available(self, video_id, sub_lang_list):
+        """Report available subtitles."""
+        sub_lang = ",".join(sub_lang_list)
+        self.to_screen(u'%s: Available subtitles for video: %s' % (video_id, sub_lang))
+
+    def _list_available_subtitles(self, video_id):
+        """Look up the subtitle languages of video_id and print them."""
+        sub_lang_list = self._get_available_subtitles(video_id)
+        self.report_video_subtitles_available(video_id, sub_lang_list)
+
+    def _extract_subtitles(self, video_id):
+        """
+        Return a dictionary: {language: subtitles} or {} if the subtitles
+        couldn't be found
+        """
+        sub_lang_list = self._get_available_subtitles(video_id)
+        if not sub_lang_list:  # There was some error, it didn't get the available subtitles
+            return {}
+        params = self._downloader.params
+        if params.get('writesubtitles', False):
+            # Single-language mode: the requested language, else English,
+            # else whichever language the listing yields first.
+            if params.get('subtitleslang', False):
+                sub_lang = params.get('subtitleslang')
+            elif 'en' in sub_lang_list:
+                sub_lang = 'en'
+            else:
+                sub_lang = next(iter(sub_lang_list))
+            if sub_lang not in sub_lang_list:
+                self._downloader.report_warning(u'no closed captions found in the specified language "%s"' % sub_lang)
+                return {}
+            sub_lang_list = {sub_lang: sub_lang_list[sub_lang]}
+        sub_format = params.get('subtitlesformat')
+        subtitles = {}
+        for sub_lang in sub_lang_list:
+            # the listing value is handed on, UTF-8 encoded, as sub_name
+            subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format)
+            if subtitle:
+                subtitles[sub_lang] = subtitle
+        return subtitles
+
+    def _request_subtitle(self, sub_lang, sub_name, video_id, format):
+        """ Return the subtitle as a string or None if they are not found """
+        self.to_screen(u'%s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format))
+        url = self._get_subtitle_url(sub_lang, sub_name, video_id, format)
+        try:
+            response = compat_urllib_request.urlopen(url)
+            try:
+                sub = response.read().decode('utf-8')
+            finally:
+                # release the connection instead of waiting for garbage collection
+                response.close()
+        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
+            self._downloader.report_warning(u'unable to download video subtitles for %s: %s' % (sub_lang, compat_str(err)))
+            return
+        if not sub:
+            self._downloader.report_warning(u'Did not fetch video subtitles')
+            return
+        return sub
+
+    def _get_available_subtitles(self, video_id):
+        """Get available subtitles. Redefine in subclasses.
+
+        Returns a dictionary {lang: url}. This default returns {} so that
+        the callers in this class, which treat the result as a dictionary,
+        report "no subtitles" rather than failing on None.
+        """
+        return {}
+
+    def _get_subtitle_url(self, sub_lang, sub_name, video_id, format):
+        """returns the url for the given subtitle. Redefine in subclasses."""
+        return None
+
+    def _request_automatic_caption(self, video_id, webpage):
+        """Request automatic caption. Redefine in subclasses.
+
+        The return shape is not fixed by this base class; this default
+        returns None.
+        """
+        return None