Merge pull request #6428 from dstftw/improve-generic-smil-support

author Sergey M. <dstftw@gmail.com>

Fri, 7 Aug 2015 23:47:33 +0000 (05:47 +0600)

committer Sergey M. <dstftw@gmail.com>

Fri, 7 Aug 2015 23:47:33 +0000 (05:47 +0600)
author Sergey M. <dstftw@gmail.com>
Fri, 7 Aug 2015 23:47:33 +0000 (05:47 +0600)
committer Sergey M. <dstftw@gmail.com>
Fri, 7 Aug 2015 23:47:33 +0000 (05:47 +0600)
diff --combined youtube_dl/extractor/common.py

index 507ea5ec0b13abc2f2077b0096f900d40861fad8,717dcec7b8e39156ad5cc33aadf64ac16e92c47a..def6caa0d1eb320a091ecaecc50db58b82d4f28d
--- 1/youtube_dl/extractor/common.py
--- 2/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@@ -18,6 -18,7 +18,7 @@@ from ..compat import 
       compat_HTTPError,
       compat_http_client,
       compat_urllib_error,
+     compat_urllib_parse,
       compat_urllib_parse_urlparse,
       compat_urllib_request,
       compat_urlparse,
@@@ -37,6 -38,7 +38,7 @@@ from ..utils import 
       RegexNotFoundError,
       sanitize_filename,
       unescapeHTML,
+     url_basename,
   )
   
   
@@@ -636,7 -638,7 +638,7 @@@ class InfoExtractor(object)
       @staticmethod
       def _meta_regex(prop):
           return r'''(?isx)<meta
- -                    (?=[^>]+(?:itemprop|name|property)=(["\']?)%s\1)
+ +                    (?=[^>]+(?:itemprop|name|property|id)=(["\']?)%s\1)
                       [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
   
       def _og_search_property(self, prop, html, name=None, **kargs):
@@@ -978,69 -980,167 +980,167 @@@
           self._sort_formats(formats)
           return formats
   
-     # TODO: improve extraction
-     def _extract_smil_formats(self, smil_url, video_id, fatal=True):
-         smil = self._download_xml(
-             smil_url, video_id, 'Downloading SMIL file',
-             'Unable to download SMIL file', fatal=fatal)
+     @staticmethod
+     def _xpath_ns(path, namespace=None):
+         if not namespace:
+             return path
+         out = []
+         for c in path.split('/'):
+             if not c or c == '.':
+                 out.append(c)
+             else:
+                 out.append('{%s}%s' % (namespace, c))
+         return '/'.join(out)
+ 
+     def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None):
+         smil = self._download_smil(smil_url, video_id, fatal=fatal)
+ 
           if smil is False:
               assert not fatal
               return []
   
-         base = smil.find('./head/meta').get('base')
+         namespace = self._parse_smil_namespace(smil)
+ 
+         return self._parse_smil_formats(
+             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
+ 
+     def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
+         smil = self._download_smil(smil_url, video_id, fatal=fatal)
+         if smil is False:
+             return {}
+         return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
+ 
+     def _download_smil(self, smil_url, video_id, fatal=True):
+         return self._download_xml(
+             smil_url, video_id, 'Downloading SMIL file',
+             'Unable to download SMIL file', fatal=fatal)
+ 
+     def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
+         namespace = self._parse_smil_namespace(smil)
+ 
+         formats = self._parse_smil_formats(
+             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
+         subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
+ 
+         video_id = os.path.splitext(url_basename(smil_url))[0]
+         title = None
+         description = None
+         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
+             name = meta.attrib.get('name')
+             content = meta.attrib.get('content')
+             if not name or not content:
+                 continue
+             if not title and name == 'title':
+                 title = content
+             elif not description and name in ('description', 'abstract'):
+                 description = content
+ 
+         return {
+             'id': video_id,
+             'title': title or video_id,
+             'description': description,
+             'formats': formats,
+             'subtitles': subtitles,
+         }
+ 
+     def _parse_smil_namespace(self, smil):
+         return self._search_regex(
+             r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
+ 
+     def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None):
+         base = smil_url
+         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
+             b = meta.get('base') or meta.get('httpBase')
+             if b:
+                 base = b
+                 break
   
           formats = []
           rtmp_count = 0
-         if smil.findall('./body/seq/video'):
-             video = smil.findall('./body/seq/video')[0]
-             fmts, rtmp_count = self._parse_smil_video(video, video_id, base, rtmp_count)
-             formats.extend(fmts)
-         else:
-             for video in smil.findall('./body/switch/video'):
-                 fmts, rtmp_count = self._parse_smil_video(video, video_id, base, rtmp_count)
-                 formats.extend(fmts)
+         http_count = 0
+ 
+         videos = smil.findall(self._xpath_ns('.//video', namespace))
+         for video in videos:
+             src = video.get('src')
+             if not src:
+                 continue
+ 
+             bitrate = int_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000)
+             filesize = int_or_none(video.get('size') or video.get('fileSize'))
+             width = int_or_none(video.get('width'))
+             height = int_or_none(video.get('height'))
+             proto = video.get('proto')
+             ext = video.get('ext')
+             src_ext = determine_ext(src)
+             streamer = video.get('streamer') or base
+ 
+             if proto == 'rtmp' or streamer.startswith('rtmp'):
+                 rtmp_count += 1
+                 formats.append({
+                     'url': streamer,
+                     'play_path': src,
+                     'ext': 'flv',
+                     'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
+                     'tbr': bitrate,
+                     'filesize': filesize,
+                     'width': width,
+                     'height': height,
+                 })
+                 continue
+ 
+             src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
+ 
+             if proto == 'm3u8' or src_ext == 'm3u8':
+                 formats.extend(self._extract_m3u8_formats(
+                     src_url, video_id, ext or 'mp4', m3u8_id='hls'))
+                 continue
+ 
+             if src_ext == 'f4m':
+                 f4m_url = src_url
+                 if not f4m_params:
+                     f4m_params = {
+                         'hdcore': '3.2.0',
+                         'plugin': 'flowplayer-3.2.0.1',
+                     }
+                 f4m_url += '&' if '?' in f4m_url else '?'
+                 f4m_url += compat_urllib_parse.urlencode(f4m_params)
+                 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds'))
+                 continue
+ 
+             if src_url.startswith('http'):
+                 http_count += 1
+                 formats.append({
+                     'url': src_url,
+                     'ext': ext or src_ext or 'flv',
+                     'format_id': 'http-%d' % (bitrate or http_count),
+                     'tbr': bitrate,
+                     'filesize': filesize,
+                     'width': width,
+                     'height': height,
+                 })
+                 continue
   
           self._sort_formats(formats)
   
           return formats
   
-     def _parse_smil_video(self, video, video_id, base, rtmp_count):
-         src = video.get('src')
-         if not src:
-             return [], rtmp_count
-         bitrate = int_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000)
-         width = int_or_none(video.get('width'))
-         height = int_or_none(video.get('height'))
-         proto = video.get('proto')
-         if not proto:
-             if base:
-                 if base.startswith('rtmp'):
-                     proto = 'rtmp'
-                 elif base.startswith('http'):
-                     proto = 'http'
-         ext = video.get('ext')
-         if proto == 'm3u8':
-             return self._extract_m3u8_formats(src, video_id, ext), rtmp_count
-         elif proto == 'rtmp':
-             rtmp_count += 1
-             streamer = video.get('streamer') or base
-             return ([{
-                 'url': streamer,
-                 'play_path': src,
-                 'ext': 'flv',
-                 'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
-                 'tbr': bitrate,
-                 'width': width,
-                 'height': height,
-             }], rtmp_count)
-         elif proto.startswith('http'):
-             return ([{
-                 'url': base + src,
-                 'ext': ext or 'flv',
-                 'tbr': bitrate,
-                 'width': width,
-                 'height': height,
-             }], rtmp_count)
+     def _parse_smil_subtitles(self, smil, namespace=None):
+         subtitles = {}
+         for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
+             src = textstream.get('src')
+             if not src:
+                 continue
+             ext = textstream.get('ext') or determine_ext(src)
+             if not ext:
+                 type_ = textstream.get('type')
+                 if type_ == 'text/srt':
+                     ext = 'srt'
+             lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName')
+             subtitles.setdefault(lang, []).append({
+                 'url': src,
+                 'ext': ext,
+             })
+         return subtitles
   
       def _live_title(self, name):
           """ Generate the title for a live video """
diff --combined youtube_dl/extractor/generic.py

index 469909a51f4dc899b007fecfac877229dab485b4,27584c44cf122d2f4ac41463c5311184bc2a0268..901f77304103af4aa28c3cb1104ff23a15da59cc
--- 1/youtube_dl/extractor/generic.py
--- 2/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@@ -130,6 -130,74 +130,74 @@@ class GenericIE(InfoExtractor)
                   'title': 'pdv_maddow_netcast_m4v-02-27-2015-201624',
               }
           },
+         # SMIL from http://videolectures.net/promogram_igor_mekjavic_eng
+         {
+             'url': 'http://videolectures.net/promogram_igor_mekjavic_eng/video/1/smil.xml',
+             'info_dict': {
+                 'id': 'smil',
+                 'ext': 'mp4',
+                 'title': 'Automatics, robotics and biocybernetics',
+                 'description': 'md5:815fc1deb6b3a2bff99de2d5325be482',
+                 'formats': 'mincount:16',
+                 'subtitles': 'mincount:1',
+             },
+             'params': {
+                 'force_generic_extractor': True,
+                 'skip_download': True,
+             },
+         },
+         # SMIL from http://www1.wdr.de/mediathek/video/livestream/index.html
+         {
+             'url': 'http://metafilegenerator.de/WDR/WDR_FS/hds/hds.smil',
+             'info_dict': {
+                 'id': 'hds',
+                 'ext': 'flv',
+                 'title': 'hds',
+                 'formats': 'mincount:1',
+             },
+             'params': {
+                 'skip_download': True,
+             },
+         },
+         # SMIL from https://www.restudy.dk/video/play/id/1637
+         {
+             'url': 'https://www.restudy.dk/awsmedia/SmilDirectory/video_1637.xml',
+             'info_dict': {
+                 'id': 'video_1637',
+                 'ext': 'flv',
+                 'title': 'video_1637',
+                 'formats': 'mincount:3',
+             },
+             'params': {
+                 'skip_download': True,
+             },
+         },
+         # SMIL from http://adventure.howstuffworks.com/5266-cool-jobs-iditarod-musher-video.htm
+         {
+             'url': 'http://services.media.howstuffworks.com/videos/450221/smil-service.smil',
+             'info_dict': {
+                 'id': 'smil-service',
+                 'ext': 'flv',
+                 'title': 'smil-service',
+                 'formats': 'mincount:1',
+             },
+             'params': {
+                 'skip_download': True,
+             },
+         },
+         # SMIL from http://new.livestream.com/CoheedandCambria/WebsterHall/videos/4719370
+         {
+             'url': 'http://api.new.livestream.com/accounts/1570303/events/1585861/videos/4719370.smil',
+             'info_dict': {
+                 'id': '4719370',
+                 'ext': 'mp4',
+                 'title': '571de1fd-47bc-48db-abf9-238872a58d1f',
+                 'formats': 'mincount:3',
+             },
+             'params': {
+                 'skip_download': True,
+             },
+         },
           # google redirect
           {
               'url': 'http://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=1&cad=rja&ved=0CCUQtwIwAA&url=http%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DcmQHVoWB5FY&ei=F-sNU-LLCaXk4QT52ICQBQ&usg=AFQjCNEw4hL29zgOohLXvpJ-Bdh2bils1Q&bvm=bv.61965928,d.bGE',
@@@ -236,19 -304,6 +304,19 @@@
               },
               'add_ie': ['Ooyala'],
           },
+ +        {
+ +            # ooyala video embedded with http://player.ooyala.com/iframe.js
+ +            'url': 'http://www.macrumors.com/2015/07/24/steve-jobs-the-man-in-the-machine-first-trailer/',
+ +            'info_dict': {
+ +                'id': 'p0MGJndjoG5SOKqO_hZJuZFPB-Tr5VgB',
+ +                'ext': 'mp4',
+ +                'title': '"Steve Jobs: Man in the Machine" trailer',
+ +                'description': 'The first trailer for the Alex Gibney documentary "Steve Jobs: Man in the Machine."',
+ +            },
+ +            'params': {
+ +                'skip_download': True,
+ +            },
+ +        },
           # multiple ooyala embeds on SBN network websites
           {
               'url': 'http://www.sbnation.com/college-football-recruiting/2015/2/3/7970291/national-signing-day-rationalizations-itll-be-ok-itll-be-ok',
@@@ -1123,11 -1178,13 +1191,13 @@@
   
           self.report_extraction(video_id)
   
-         # Is it an RSS feed?
+         # Is it an RSS feed or a SMIL file?
           try:
               doc = parse_xml(webpage)
               if doc.tag == 'rss':
                   return self._extract_rss(url, video_id, doc)
+             elif re.match(r'^(?:{[^}]+})?smil$', doc.tag):
+                 return self._parse_smil(doc, url, video_id)
           except compat_xml_parse_error:
               pass
   
@@@ -1333,7 -1390,7 +1403,7 @@@
               return self.url_result(mobj.group('url'))
   
           # Look for Ooyala videos
- -        mobj = (re.search(r'player\.ooyala\.com/[^"?]+\?[^"]*?(?:embedCode|ec)=(?P<ec>[^"&]+)', webpage) or
+ +        mobj = (re.search(r'player\.ooyala\.com/[^"?]+[?#][^"]*?(?:embedCode|ec)=(?P<ec>[^"&]+)', webpage) or
                   re.search(r'OO\.Player\.create\([\'"].*?[\'"],\s*[\'"](?P<ec>.{32})[\'"]', webpage) or
                   re.search(r'SBN\.VideoLinkset\.ooyala\([\'"](?P<ec>.{32})[\'"]\)', webpage) or
                   re.search(r'data-ooyala-video-id\s*=\s*[\'"](?P<ec>.{32})[\'"]', webpage))
@@@ -1668,7 -1725,7 +1738,7 @@@
           if not found:
               # Broaden the findall a little bit: JWPlayer JS loader
               found = filter_video(re.findall(
- -                r'[^A-Za-z0-9]?file["\']?:\s*["\'](http(?![^\'"]+\.[0-9]+[\'"])[^\'"]+)["\']', webpage))
+ +                r'[^A-Za-z0-9]?(?:file|video_url)["\']?:\s*["\'](http(?![^\'"]+\.[0-9]+[\'"])[^\'"]+)["\']', webpage))
           if not found:
               # Flow player
               found = filter_video(re.findall(r'''(?xs)
author	Sergey M. <dstftw@gmail.com>
	Fri, 7 Aug 2015 23:47:33 +0000 (05:47 +0600)
committer	Sergey M. <dstftw@gmail.com>
	Fri, 7 Aug 2015 23:47:33 +0000 (05:47 +0600)
		1	2
youtube_dl/extractor/common.py	patch \|	diff1 \|	diff2 \|	blob \| history
youtube_dl/extractor/generic.py	patch \|	diff1 \|	diff2 \|	blob \| history