[utils] Use bytes-like objects in dfxp2srt

author Yen Chi Hsuan <yan12125@gmail.com>

Sat, 16 Sep 2017 04:18:38 +0000 (12:18 +0800)

committer Yen Chi Hsuan <yan12125@gmail.com>

Sat, 16 Sep 2017 04:18:38 +0000 (12:18 +0800)
author Yen Chi Hsuan <yan12125@gmail.com>
Sat, 16 Sep 2017 04:18:38 +0000 (12:18 +0800)
committer Yen Chi Hsuan <yan12125@gmail.com>
Sat, 16 Sep 2017 04:18:38 +0000 (12:18 +0800)
diff --git a/ChangeLog b/ChangeLog

index 041dfd7b98205db19345b833880c00d57df8da00..ba9260e3eb348713dd38fda4b1b38a3ff23cec7b 100644 (file)
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,9 @@
+version <unreleased>
+
+Core
+* [utils] Fix handling raw TTML subtitles (#14191)
+
+
  version 2017.09.15
  
  Core
diff --git a/test/test_utils.py b/test/test_utils.py

index e50f3764e57050c560365eb566979e171538985b..efa73d0f45e17b76647a8e7895a5bd3c62f404b3 100644 (file)
--- a/test/test_utils.py
+++ b/test/test_utils.py
@@ -1064,7 +1064,7 @@ ffmpeg version 2.4.4 Copyright (c) 2000-2014 the FFmpeg ...'''), '2.4.4')
                      <p begin="3" dur="-1">Ignored, three</p>
                  </div>
              </body>
-            </tt>'''
+            </tt>'''.encode('utf-8')
          srt_data = '''1
  00:00:00,000 --> 00:00:01,000
  The following line contains Chinese characters and special symbols
@@ -1089,7 +1089,7 @@ Line
                      <p begin="0" end="1">The first line</p>
                  </div>
              </body>
-            </tt>'''
+            </tt>'''.encode('utf-8')
          srt_data = '''1
  00:00:00,000 --> 00:00:01,000
  The first line
@@ -1115,7 +1115,7 @@ The first line
        <p style="s1" tts:textDecoration="underline" begin="00:00:09.56" id="p2" end="00:00:12.36"><span style="s2" tts:color="lime">inner<br /> </span>style</p>
      </div>
    </body>
-</tt>'''
+</tt>'''.encode('utf-8')
          srt_data = '''1
  00:00:02,080 --> 00:00:05,839
  <font color="white" face="sansSerif" size="16">default style<font color="red">custom style</font></font>
@@ -1138,6 +1138,26 @@ part 3</font></u>
  '''
          self.assertEqual(dfxp2srt(dfxp_data_with_style), srt_data)
  
+        dfxp_data_non_utf8 = '''<?xml version="1.0" encoding="UTF-16"?>
+            <tt xmlns="http://www.w3.org/ns/ttml" xml:lang="en" xmlns:tts="http://www.w3.org/ns/ttml#parameter">
+            <body>
+                <div xml:lang="en">
+                    <p begin="0" end="1">Line 1</p>
+                    <p begin="1" end="2">第二行</p>
+                </div>
+            </body>
+            </tt>'''.encode('utf-16')
+        srt_data = '''1
+00:00:00,000 --> 00:00:01,000
+Line 1
+
+2
+00:00:01,000 --> 00:00:02,000
+第二行
+
+'''
+        self.assertEqual(dfxp2srt(dfxp_data_non_utf8), srt_data)
+
      def test_cli_option(self):
          self.assertEqual(cli_option({'proxy': '127.0.0.1:3128'}, '--proxy', 'proxy'), ['--proxy', '127.0.0.1:3128'])
          self.assertEqual(cli_option({'proxy': None}, '--proxy', 'proxy'), [])
diff --git a/youtube_dl/postprocessor/ffmpeg.py b/youtube_dl/postprocessor/ffmpeg.py

index 51256a3fb52c0f51547c335eecf6774af07307c8..f71d413b5285005cbc98c75f0092053631e7c3f5 100644 (file)
--- a/youtube_dl/postprocessor/ffmpeg.py
+++ b/youtube_dl/postprocessor/ffmpeg.py
@@ -585,7 +585,7 @@ class FFmpegSubtitlesConvertorPP(FFmpegPostProcessor):
                  dfxp_file = old_file
                  srt_file = subtitles_filename(filename, lang, 'srt')
  
-                with io.open(dfxp_file, 'rt', encoding='utf-8') as f:
+                with open(dfxp_file, 'rb') as f:
                      srt_data = dfxp2srt(f.read())
  
                  with io.open(srt_file, 'wt', encoding='utf-8') as f:
diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py

index 9e4492d402c225d53071d1424005ff5be0577681..b724e0b70514b7077076beb9f9ca3a0f02b57fbd 100644 (file)
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -2572,14 +2572,18 @@ def srt_subtitles_timecode(seconds):
  
  
  def dfxp2srt(dfxp_data):
+    '''
+    @param dfxp_data A bytes-like object containing DFXP data
+    @returns A unicode object containing converted SRT data
+    '''
      LEGACY_NAMESPACES = (
-        ('http://www.w3.org/ns/ttml', [
-            'http://www.w3.org/2004/11/ttaf1',
-            'http://www.w3.org/2006/04/ttaf1',
-            'http://www.w3.org/2006/10/ttaf1',
+        (b'http://www.w3.org/ns/ttml', [
+            b'http://www.w3.org/2004/11/ttaf1',
+            b'http://www.w3.org/2006/04/ttaf1',
+            b'http://www.w3.org/2006/10/ttaf1',
          ]),
-        ('http://www.w3.org/ns/ttml#styling', [
-            'http://www.w3.org/ns/ttml#style',
+        (b'http://www.w3.org/ns/ttml#styling', [
+            b'http://www.w3.org/ns/ttml#style',
          ]),
      )
  
@@ -2674,7 +2678,7 @@ def dfxp2srt(dfxp_data):
          for ns in v:
              dfxp_data = dfxp_data.replace(ns, k)
  
-    dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8'))
+    dfxp = compat_etree_fromstring(dfxp_data)
      out = []
      paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')
author	Yen Chi Hsuan <yan12125@gmail.com>
	Sat, 16 Sep 2017 04:18:38 +0000 (12:18 +0800)
committer	Yen Chi Hsuan <yan12125@gmail.com>
	Sat, 16 Sep 2017 04:18:38 +0000 (12:18 +0800)
ChangeLog		patch | blob | history
test/test_utils.py		patch | blob | history
youtube_dl/postprocessor/ffmpeg.py		patch | blob | history
youtube_dl/utils.py		patch | blob | history