xpath_text,
render_table,
match_str,
+ parse_dfxp_time_expr,
+ dfxp2srt,
)
'like_count > 100 & dislike_count <? 50 & description',
{'like_count': 190, 'dislike_count': 10}))
+ def test_parse_dfxp_time_expr(self):
+ self.assertEqual(parse_dfxp_time_expr(None), 0.0)
+ self.assertEqual(parse_dfxp_time_expr(''), 0.0)
+ self.assertEqual(parse_dfxp_time_expr('0.1'), 0.1)
+ self.assertEqual(parse_dfxp_time_expr('0.1s'), 0.1)
+ self.assertEqual(parse_dfxp_time_expr('00:00:01'), 1.0)
+ self.assertEqual(parse_dfxp_time_expr('00:00:01.100'), 1.1)
+
+ def test_dfxp2srt(self):
+ dfxp_data = '''<?xml version="1.0" encoding="UTF-8"?>
+ <tt xmlns="http://www.w3.org/ns/ttml" xml:lang="en" xmlns:tts="http://www.w3.org/ns/ttml#parameter">
+ <body>
+ <div xml:lang="en">
+ <p begin="0" end="1">The following line contains Chinese characters and special symbols</p>
+ <p begin="1" end="2">第二行<br/>♪♪</p>
+ <p begin="2" end="3"><span>Third<br/>Line</span></p>
+ </div>
+ </body>
+ </tt>'''
+ srt_data = '''1
+00:00:00,000 --> 00:00:01,000
+The following line contains Chinese characters and special symbols
+
+2
+00:00:01,000 --> 00:00:02,000
+第二行
+♪♪
+
+3
+00:00:02,000 --> 00:00:03,000
+Third
+Line
+
+'''
+ self.assertEqual(dfxp2srt(dfxp_data), srt_data)
+
if __name__ == '__main__':
unittest.main()
)
from .southpark import (
SouthParkIE,
+ SouthParkEsIE,
SouthparkDeIE,
)
from .space import SpaceIE
unescapeHTML,
find_xpath_attr,
smuggle_url,
+ determine_ext,
)
from .senateisvp import SenateISVPIE
return self.url_result(surl, 'SenateISVP', video_id, title)
files = data['video']['files']
+ try:
+ capfile = data['video']['capfile']['#text']
+ except KeyError:
+ capfile = None
entries = [{
'id': '%s_%d' % (video_id, partnum + 1),
'description': description,
'thumbnail': thumbnail,
'duration': int_or_none(f.get('length', {}).get('#text')),
+ 'subtitles': {
+ 'en': [{
+ 'url': capfile,
+ 'ext': determine_ext(capfile, 'dfxp')
+ }],
+ } if capfile else None,
} for partnum, f in enumerate(files)]
if len(entries) == 1:
from .common import InfoExtractor
from ..utils import (
ExtractorError,
- parse_iso8601,
)
import re
from .common import InfoExtractor
-from ..utils import (
- int_or_none,
-)
+from ..utils import int_or_none
class InstagramIE(InfoExtractor):
- _VALID_URL = r'http://instagram\.com/p/(?P<id>.*?)/'
+ _VALID_URL = r'https?://instagram\.com/p/(?P<id>[\da-zA-Z]+)'
_TEST = {
'url': 'http://instagram.com/p/aye83DjauH/?foo=bar#abc',
'md5': '0d2da106a9d2631273e192b372806516',
}
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
+ video_id = self._match_id(url)
+
webpage = self._download_webpage(url, video_id)
uploader_id = self._search_regex(r'"owner":{"username":"(.+?)"',
webpage, 'uploader id', fatal=False)
class MTVServicesInfoExtractor(InfoExtractor):
_MOBILE_TEMPLATE = None
+ _LANG = None
@staticmethod
def _id_from_uri(uri):
video_id = self._id_from_uri(uri)
feed_url = self._get_feed_url(uri)
data = compat_urllib_parse.urlencode({'uri': uri})
+ info_url = feed_url + '?'
+ if self._LANG:
+ info_url += 'lang=%s&' % self._LANG
+ info_url += data
idoc = self._download_xml(
- feed_url + '?' + data, video_id,
+ info_url, video_id,
'Downloading info', transform_source=fix_xml_ampersands)
return self.playlist_result(
[self._get_video_info(item) for item in idoc.findall('.//item')])
_VALID_URL = r'http://iptv\.orf\.at/(?:#/)?stories/(?P<id>\d+)'
_TEST = {
- 'url': 'http://iptv.orf.at/stories/2267952',
- 'md5': '26ffa4bab6dbce1eee78bbc7021016cd',
+ 'url': 'http://iptv.orf.at/stories/2275236/',
+ 'md5': 'c8b22af4718a4b4af58342529453e3e5',
'info_dict': {
- 'id': '339775',
+ 'id': '350612',
'ext': 'flv',
- 'title': 'Kreml-Kritiker Nawalny wieder frei',
- 'description': 'md5:6f24e7f546d364dacd0e616a9e409236',
- 'duration': 84.729,
+ 'title': 'Weitere Evakuierungen um Vulkan Calbuco',
+ 'description': 'md5:d689c959bdbcf04efeddedbf2299d633',
+ 'duration': 68.197,
'thumbnail': 're:^https?://.*\.jpg$',
- 'upload_date': '20150306',
+ 'upload_date': '20150425',
},
}
class SouthParkIE(MTVServicesInfoExtractor):
IE_NAME = 'southpark.cc.com'
- _VALID_URL = r'https?://(www\.)?(?P<url>southpark\.cc\.com/(clips|full-episodes)/(?P<id>.+?)(\?|#|$))'
+ _VALID_URL = r'https?://(?:www\.)?(?P<url>southpark\.cc\.com/(?:clips|full-episodes)/(?P<id>.+?)(\?|#|$))'
_FEED_URL = 'http://www.southparkstudios.com/feeds/video-player/mrss'
}]
class SouthParkEsIE(SouthParkIE):
    # Extractor for the Spanish-language episode pages on southpark.cc.com.
    # Everything except the URL pattern and the language is inherited from
    # SouthParkIE.
    IE_NAME = 'southpark.cc.com:espanol'

    # When set, the MTV services base class adds "lang=<value>&" to the
    # query string of the feed request.
    _LANG = 'es'

    _VALID_URL = r'https?://(?:www\.)?(?P<url>southpark\.cc\.com/episodios-en-espanol/(?P<id>.+?)(\?|#|$))'

    _TESTS = [{
        'url': 'http://southpark.cc.com/episodios-en-espanol/s01e01-cartman-consigue-una-sonda-anal#source=351c1323-0b96-402d-a8b9-40d01b2e9bde&position=1&sort=!airdate',
        'playlist_count': 4,
    }]
+
+
class SouthparkDeIE(SouthParkIE):
IE_NAME = 'southpark.de'
- _VALID_URL = r'https?://(www\.)?(?P<url>southpark\.de/(clips|alle-episoden)/(?P<id>.+?)(\?|#|$))'
+ _VALID_URL = r'https?://(?:www\.)?(?P<url>southpark\.de/(?:clips|alle-episoden)/(?P<id>.+?)(\?|#|$))'
_FEED_URL = 'http://www.southpark.de/feeds/video-player/mrss/'
_TESTS = [{
compat_urlretrieve(info['thumbnail'], temp_thumbnail)
if info['ext'] == 'mp3':
- options = ['-i', temp_thumbnail, '-c', 'copy', '-map', '0', '-map', '1',
+ options = [
+ '-i', temp_thumbnail, '-c', 'copy', '-map', '0', '-map', '1',
'-metadata:s:v', 'title="Album cover"', '-metadata:s:v', 'comment="Cover (Front)"']
self._downloader.to_screen('[ffmpeg] Adding thumbnail to "%s"' % filename)
prepend_extension,
shell_quote,
subtitles_filename,
+ dfxp2srt,
)
'format' % new_ext)
continue
new_file = subtitles_filename(filename, lang, new_ext)
+
+ if ext == 'dfxp' or ext == 'ttml':
+ self._downloader.report_warning(
+ 'You have requested to convert dfxp (TTML) subtitles into another format, '
+ 'which results in style information loss')
+
+ dfxp_file = subtitles_filename(filename, lang, ext)
+ srt_file = subtitles_filename(filename, lang, 'srt')
+
+ with io.open(dfxp_file, 'rt', encoding='utf-8') as f:
+ srt_data = dfxp2srt(f.read())
+
+ with io.open(srt_file, 'wt', encoding='utf-8') as f:
+ f.write(srt_data)
+
+ ext = 'srt'
+ subs[lang] = {
+ 'ext': 'srt',
+ 'data': srt_data
+ }
+
+ if new_ext == 'srt':
+ continue
+
self.run_ffmpeg(
subtitles_filename(filename, lang, ext),
new_file, ['-f', new_format])
return _match_func
def parse_dfxp_time_expr(time_expr):
    """Convert a DFXP (TTML) time expression into a number of seconds.

    Two forms are understood:

    * offset-time: a decimal number optionally followed by one of the
      metrics ``h`` (hours), ``m`` (minutes), ``s`` (seconds) or ``ms``
      (milliseconds); a bare number is taken as seconds.
      Examples: ``'0.1'``, ``'0.1s'``, ``'2m'``, ``'500ms'``.
    * clock-time: ``hours:minutes:seconds`` with an optional fractional
      part on the seconds. Example: ``'00:00:01.100'``.

    Returns 0.0 when time_expr is None or empty, the value in seconds as a
    float when the expression is recognised, and None otherwise (e.g. for
    frame- or tick-based expressions, which cannot be converted without
    the document's frame/tick rate).
    """
    if not time_expr:
        return 0.0

    # Seconds represented by one unit of each supported offset-time metric.
    metric_seconds = {'h': 3600.0, 'm': 60.0, 's': 1.0, 'ms': 0.001}

    mobj = re.match(
        r'^(?P<time_offset>\d+(?:\.\d+)?)(?P<metric>h|ms|m|s)?$', time_expr)
    if mobj:
        # No metric at all keeps the historical meaning: plain seconds.
        metric = mobj.group('metric') or 's'
        return float(mobj.group('time_offset')) * metric_seconds[metric]

    mobj = re.match(r'^(\d+):(\d\d):(\d\d(?:\.\d+)?)$', time_expr)
    if mobj:
        return 3600 * int(mobj.group(1)) + 60 * int(mobj.group(2)) + float(mobj.group(3))

    # Unrecognised expression: make the "no result" outcome explicit.
    return None
+
+
def format_srt_time(seconds):
    """Format a time offset in seconds as an SRT timestamp ``HH:MM:SS,mmm``.

    The value is rounded to the nearest whole millisecond before it is
    split into fields. Splitting first and truncating the fractional part
    afterwards loses a millisecond whenever the float representation is
    slightly below the intended value (2.3 would come out as ``02,299``).
    """
    total_millisecs = int(round(seconds * 1000))
    (total_secs, millisecs) = divmod(total_millisecs, 1000)
    (total_mins, secs) = divmod(total_secs, 60)
    (hours, mins) = divmod(total_mins, 60)
    return '%02d:%02d:%02d,%03d' % (hours, mins, secs, millisecs)
+
+
def dfxp2srt(dfxp_data):
    """Convert a DFXP (TTML) subtitle document into SRT text.

    dfxp_data is the document as a text string. Every ``p`` element in the
    TTML namespace becomes one SRT cue, numbered from 1 in document order,
    with its ``begin``/``end`` attributes converted to SRT timestamps.
    Inside a paragraph, ``br`` elements become newlines and ``span``
    elements are flattened to their text; any other child element is kept
    as serialized XML.

    Returns the SRT document as one string (empty if there is no paragraph).
    """
    _x = functools.partial(xpath_with_ns, ns_map={'ttml': 'http://www.w3.org/ns/ttml'})
    str_or_empty = functools.partial(str_or_none, default='')
    br_tag = _x('ttml:br')
    span_tag = _x('ttml:span')

    def parse_node(node):
        # Flatten node into plain text, recursing into nested spans.
        pieces = [str_or_empty(node.text)]

        for child in node:
            if child.tag == br_tag:
                pieces.append('\n')
                pieces.append(str_or_empty(child.tail))
            elif child.tag == span_tag:
                pieces.append(parse_node(child))
                # Text that follows the closing </span> belongs to the
                # parent and must not be dropped.
                pieces.append(str_or_empty(child.tail))
            else:
                # tostring() returns bytes and already includes the tail
                # of the element; decode so only text is concatenated.
                pieces.append(str_or_empty(
                    xml.etree.ElementTree.tostring(child).decode('utf-8')))

        return ''.join(pieces)

    dfxp = xml.etree.ElementTree.fromstring(dfxp_data.encode('utf-8'))
    paras = dfxp.findall(_x('.//ttml:p'))

    cues = [
        '%d\n%s --> %s\n%s\n\n' % (
            index,
            format_srt_time(parse_dfxp_time_expr(para.attrib.get('begin'))),
            format_srt_time(parse_dfxp_time_expr(para.attrib.get('end'))),
            parse_node(para))
        for index, para in enumerate(paras, 1)
    ]

    return ''.join(cues)
+
+
class PerRequestProxyHandler(compat_urllib_request.ProxyHandler):
def __init__(self, proxies=None):
# Set default handlers