Merge pull request #5521 from mrkrossxdx/mpv
authorSergey M. <dstftw@gmail.com>
Sat, 25 Apr 2015 18:19:59 +0000 (23:19 +0500)
committerSergey M. <dstftw@gmail.com>
Sat, 25 Apr 2015 18:19:59 +0000 (23:19 +0500)
Added support for mpv if mplayer is not available (new version)

test/test_utils.py
youtube_dl/extractor/__init__.py
youtube_dl/extractor/cspan.py
youtube_dl/extractor/ellentv.py
youtube_dl/extractor/instagram.py
youtube_dl/extractor/mtv.py
youtube_dl/extractor/orf.py
youtube_dl/extractor/southpark.py
youtube_dl/postprocessor/embedthumbnail.py
youtube_dl/postprocessor/ffmpeg.py
youtube_dl/utils.py

index 2e3a6480cb15e3c762fa68533d6dad988740a6c4..17017a8c082ea1a023b99ee3b248e5abc49cb56b 100644 (file)
@@ -58,6 +58,8 @@ from youtube_dl.utils import (
     xpath_text,
     render_table,
     match_str,
+    parse_dfxp_time_expr,
+    dfxp2srt,
 )
 
 
@@ -581,6 +583,42 @@ ffmpeg version 2.4.4 Copyright (c) 2000-2014 the FFmpeg ...'''), '2.4.4')
             'like_count > 100 & dislike_count <? 50 & description',
             {'like_count': 190, 'dislike_count': 10}))
 
+    def test_parse_dfxp_time_expr(self):
+        self.assertEqual(parse_dfxp_time_expr(None), 0.0)
+        self.assertEqual(parse_dfxp_time_expr(''), 0.0)
+        self.assertEqual(parse_dfxp_time_expr('0.1'), 0.1)
+        self.assertEqual(parse_dfxp_time_expr('0.1s'), 0.1)
+        self.assertEqual(parse_dfxp_time_expr('00:00:01'), 1.0)
+        self.assertEqual(parse_dfxp_time_expr('00:00:01.100'), 1.1)
+
+    def test_dfxp2srt(self):
+        dfxp_data = '''<?xml version="1.0" encoding="UTF-8"?>
+            <tt xmlns="http://www.w3.org/ns/ttml" xml:lang="en" xmlns:tts="http://www.w3.org/ns/ttml#parameter">
+            <body>
+                <div xml:lang="en">
+                    <p begin="0" end="1">The following line contains Chinese characters and special symbols</p>
+                    <p begin="1" end="2">第二行<br/>♪♪</p>
+                    <p begin="2" end="3"><span>Third<br/>Line</span></p>
+                </div>
+            </body>
+            </tt>'''
+        srt_data = '''1
+00:00:00,000 --> 00:00:01,000
+The following line contains Chinese characters and special symbols
+
+2
+00:00:01,000 --> 00:00:02,000
+第二行
+♪♪
+
+3
+00:00:02,000 --> 00:00:03,000
+Third
+Line
+
+'''
+        self.assertEqual(dfxp2srt(dfxp_data), srt_data)
+
 
 if __name__ == '__main__':
     unittest.main()
index a64afa1da6d13263e8f72c21ba9c1043cc0341d8..ab80fd5e0e550b4efabd0285270d17d78c1e048d 100644 (file)
@@ -478,6 +478,7 @@ from .soundgasm import (
 )
 from .southpark import (
     SouthParkIE,
+    SouthParkEsIE,
     SouthparkDeIE,
 )
 from .space import SpaceIE
index d516b14028ae08f8de540678c7407d7b97c4595d..fbefd37d09a98bb19c82b4c09b7b08c99d147d35 100644 (file)
@@ -8,6 +8,7 @@ from ..utils import (
     unescapeHTML,
     find_xpath_attr,
     smuggle_url,
+    determine_ext,
 )
 from .senateisvp import SenateISVPIE
 
@@ -87,6 +88,10 @@ class CSpanIE(InfoExtractor):
             return self.url_result(surl, 'SenateISVP', video_id, title)
 
         files = data['video']['files']
+        try:
+            capfile = data['video']['capfile']['#text']
+        except KeyError:
+            capfile = None
 
         entries = [{
             'id': '%s_%d' % (video_id, partnum + 1),
@@ -97,6 +102,12 @@ class CSpanIE(InfoExtractor):
             'description': description,
             'thumbnail': thumbnail,
             'duration': int_or_none(f.get('length', {}).get('#text')),
+            'subtitles': {
+                'en': [{
+                    'url': capfile,
+                    'ext': determine_ext(capfile, 'dfxp')
+                }],
+            } if capfile else None,
         } for partnum, f in enumerate(files)]
 
         if len(entries) == 1:
index 74b50bca2d3cafbc4c1432d19b174db12762f2ad..02c6a4615c4436fecda86fb152a131f084640612 100644 (file)
@@ -6,7 +6,6 @@ import json
 from .common import InfoExtractor
 from ..utils import (
     ExtractorError,
-    parse_iso8601,
 )
 
 
index b020e2621a5cc3c8d7ef6a1bc2cb6aaea989f779..65f6ca103973bb25c016ae92fcb551c65def31d1 100644 (file)
@@ -3,13 +3,11 @@ from __future__ import unicode_literals
 import re
 
 from .common import InfoExtractor
-from ..utils import (
-    int_or_none,
-)
+from ..utils import int_or_none
 
 
 class InstagramIE(InfoExtractor):
-    _VALID_URL = r'http://instagram\.com/p/(?P<id>.*?)/'
+    _VALID_URL = r'https?://instagram\.com/p/(?P<id>[\da-zA-Z]+)'
     _TEST = {
         'url': 'http://instagram.com/p/aye83DjauH/?foo=bar#abc',
         'md5': '0d2da106a9d2631273e192b372806516',
@@ -23,8 +21,8 @@ class InstagramIE(InfoExtractor):
     }
 
     def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        video_id = mobj.group('id')
+        video_id = self._match_id(url)
+
         webpage = self._download_webpage(url, video_id)
         uploader_id = self._search_regex(r'"owner":{"username":"(.+?)"',
                                          webpage, 'uploader id', fatal=False)
index 4430b3416afc8af1fab70e47fb597710d6c5a75b..b48fac5e3e434569642284d0b6388cab34696b01 100644 (file)
@@ -25,6 +25,7 @@ def _media_xml_tag(tag):
 
 class MTVServicesInfoExtractor(InfoExtractor):
     _MOBILE_TEMPLATE = None
+    _LANG = None
 
     @staticmethod
     def _id_from_uri(uri):
@@ -169,8 +170,12 @@ class MTVServicesInfoExtractor(InfoExtractor):
         video_id = self._id_from_uri(uri)
         feed_url = self._get_feed_url(uri)
         data = compat_urllib_parse.urlencode({'uri': uri})
+        info_url = feed_url + '?'
+        if self._LANG:
+            info_url += 'lang=%s&' % self._LANG
+        info_url += data
         idoc = self._download_xml(
-            feed_url + '?' + data, video_id,
+            info_url, video_id,
             'Downloading info', transform_source=fix_xml_ampersands)
         return self.playlist_result(
             [self._get_video_info(item) for item in idoc.findall('.//item')])
index ca1a5bb3cd520fd9add9f195c89d8ac13619467b..2e6c9872b5d251be4eb3c61addab113aad4d2416 100644 (file)
@@ -210,16 +210,16 @@ class ORFIPTVIE(InfoExtractor):
     _VALID_URL = r'http://iptv\.orf\.at/(?:#/)?stories/(?P<id>\d+)'
 
     _TEST = {
-        'url': 'http://iptv.orf.at/stories/2267952',
-        'md5': '26ffa4bab6dbce1eee78bbc7021016cd',
+        'url': 'http://iptv.orf.at/stories/2275236/',
+        'md5': 'c8b22af4718a4b4af58342529453e3e5',
         'info_dict': {
-            'id': '339775',
+            'id': '350612',
             'ext': 'flv',
-            'title': 'Kreml-Kritiker Nawalny wieder frei',
-            'description': 'md5:6f24e7f546d364dacd0e616a9e409236',
-            'duration': 84.729,
+            'title': 'Weitere Evakuierungen um Vulkan Calbuco',
+            'description': 'md5:d689c959bdbcf04efeddedbf2299d633',
+            'duration': 68.197,
             'thumbnail': 're:^https?://.*\.jpg$',
-            'upload_date': '20150306',
+            'upload_date': '20150425',
         },
     }
 
index c20397b3d1bbffb69188ac872facd36cce5b11f7..77758bbedfc44cbb2c433f52136ddc834ab1f046 100644 (file)
@@ -5,7 +5,7 @@ from .mtv import MTVServicesInfoExtractor
 
 class SouthParkIE(MTVServicesInfoExtractor):
     IE_NAME = 'southpark.cc.com'
-    _VALID_URL = r'https?://(www\.)?(?P<url>southpark\.cc\.com/(clips|full-episodes)/(?P<id>.+?)(\?|#|$))'
+    _VALID_URL = r'https?://(?:www\.)?(?P<url>southpark\.cc\.com/(?:clips|full-episodes)/(?P<id>.+?)(\?|#|$))'
 
     _FEED_URL = 'http://www.southparkstudios.com/feeds/video-player/mrss'
 
@@ -20,9 +20,20 @@ class SouthParkIE(MTVServicesInfoExtractor):
     }]
 
 
+class SouthParkEsIE(SouthParkIE):
+    IE_NAME = 'southpark.cc.com:espanol'
+    _VALID_URL = r'https?://(?:www\.)?(?P<url>southpark\.cc\.com/episodios-en-espanol/(?P<id>.+?)(\?|#|$))'
+    _LANG = 'es'
+
+    _TESTS = [{
+        'url': 'http://southpark.cc.com/episodios-en-espanol/s01e01-cartman-consigue-una-sonda-anal#source=351c1323-0b96-402d-a8b9-40d01b2e9bde&position=1&sort=!airdate',
+        'playlist_count': 4,
+    }]
+
+
 class SouthparkDeIE(SouthParkIE):
     IE_NAME = 'southpark.de'
-    _VALID_URL = r'https?://(www\.)?(?P<url>southpark\.de/(clips|alle-episoden)/(?P<id>.+?)(\?|#|$))'
+    _VALID_URL = r'https?://(?:www\.)?(?P<url>southpark\.de/(?:clips|alle-episoden)/(?P<id>.+?)(\?|#|$))'
     _FEED_URL = 'http://www.southpark.de/feeds/video-player/mrss/'
 
     _TESTS = [{
index a2d6b14dbb47387c33c8514cd141879ab0c83753..7ba98a0ea6afbafc52e086ea29c704b8b736b948 100644 (file)
@@ -35,7 +35,8 @@ class EmbedThumbnailPP(FFmpegPostProcessor):
         compat_urlretrieve(info['thumbnail'], temp_thumbnail)
 
         if info['ext'] == 'mp3':
-            options = ['-i', temp_thumbnail, '-c', 'copy', '-map', '0', '-map', '1',
+            options = [
+                '-i', temp_thumbnail, '-c', 'copy', '-map', '0', '-map', '1',
                 '-metadata:s:v', 'title="Album cover"', '-metadata:s:v', 'comment="Cover (Front)"']
 
             self._downloader.to_screen('[ffmpeg] Adding thumbnail to "%s"' % filename)
index 7a952963e2bc44e5ddc7df8b85bd051af91299e2..1765f4969320d8ddcb5e8cceed4d0ee0b4a1ad02 100644 (file)
@@ -20,6 +20,7 @@ from ..utils import (
     prepend_extension,
     shell_quote,
     subtitles_filename,
+    dfxp2srt,
 )
 
 
@@ -651,6 +652,30 @@ class FFmpegSubtitlesConvertorPP(FFmpegPostProcessor):
                     'format' % new_ext)
                 continue
             new_file = subtitles_filename(filename, lang, new_ext)
+
+            if ext == 'dfxp' or ext == 'ttml':
+                self._downloader.report_warning(
+                    'You have requested to convert dfxp (TTML) subtitles into another format, '
+                    'which results in style information loss')
+
+                dfxp_file = subtitles_filename(filename, lang, ext)
+                srt_file = subtitles_filename(filename, lang, 'srt')
+
+                with io.open(dfxp_file, 'rt', encoding='utf-8') as f:
+                    srt_data = dfxp2srt(f.read())
+
+                with io.open(srt_file, 'wt', encoding='utf-8') as f:
+                    f.write(srt_data)
+
+                ext = 'srt'
+                subs[lang] = {
+                    'ext': 'srt',
+                    'data': srt_data
+                }
+
+                if new_ext == 'srt':
+                    continue
+
             self.run_ffmpeg(
                 subtitles_filename(filename, lang, ext),
                 new_file, ['-f', new_format])
index edeee1853e30c2b409fe53809ae7912e7966446c..5e1c4525db400941c8b312fa825debfe9a1d11e6 100644 (file)
@@ -1800,6 +1800,59 @@ def match_filter_func(filter_str):
     return _match_func
 
 
+def parse_dfxp_time_expr(time_expr):
+    if not time_expr:
+        return 0.0
+
+    mobj = re.match(r'^(?P<time_offset>\d+(?:\.\d+)?)s?$', time_expr)
+    if mobj:
+        return float(mobj.group('time_offset'))
+
+    mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:\.\d+)?)$', time_expr)
+    if mobj:
+        return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3))
+
+
+def format_srt_time(seconds):
+    (mins, secs) = divmod(seconds, 60)
+    (hours, mins) = divmod(mins, 60)
+    millisecs = (secs - int(secs)) * 1000
+    secs = int(secs)
+    return '%02d:%02d:%02d,%03d' % (hours, mins, secs, millisecs)
+
+
+def dfxp2srt(dfxp_data):
+    _x = functools.partial(xpath_with_ns, ns_map={'ttml': 'http://www.w3.org/ns/ttml'})
+
+    def parse_node(node):
+        str_or_empty = functools.partial(str_or_none, default='')
+
+        out = str_or_empty(node.text)
+
+        for child in node:
+            if child.tag == _x('ttml:br'):
+                out += '\n' + str_or_empty(child.tail)
+            elif child.tag == _x('ttml:span'):
+                out += str_or_empty(parse_node(child))
+            else:
+                out += str_or_empty(xml.etree.ElementTree.tostring(child))
+
+        return out
+
+    dfxp = xml.etree.ElementTree.fromstring(dfxp_data.encode('utf-8'))
+    out = []
+    paras = dfxp.findall(_x('.//ttml:p'))
+
+    for para, index in zip(paras, itertools.count(1)):
+        out.append('%d\n%s --> %s\n%s\n\n' % (
+            index,
+            format_srt_time(parse_dfxp_time_expr(para.attrib.get('begin'))),
+            format_srt_time(parse_dfxp_time_expr(para.attrib.get('end'))),
+            parse_node(para)))
+
+    return ''.join(out)
+
+
 class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
     def __init__(self, proxies=None):
         # Set default handlers