Merge pull request #6428 from dstftw/improve-generic-smil-support
author Sergey M. <dstftw@gmail.com>
Fri, 7 Aug 2015 23:47:33 +0000 (05:47 +0600)
committer Sergey M. <dstftw@gmail.com>
Fri, 7 Aug 2015 23:47:33 +0000 (05:47 +0600)
Improve generic SMIL support

1  2 
youtube_dl/extractor/common.py
youtube_dl/extractor/generic.py

index 507ea5ec0b13abc2f2077b0096f900d40861fad8,717dcec7b8e39156ad5cc33aadf64ac16e92c47a..def6caa0d1eb320a091ecaecc50db58b82d4f28d
@@@ -18,6 -18,7 +18,7 @@@ from ..compat import 
      compat_HTTPError,
      compat_http_client,
      compat_urllib_error,
+     compat_urllib_parse,
      compat_urllib_parse_urlparse,
      compat_urllib_request,
      compat_urlparse,
@@@ -37,6 -38,7 +38,7 @@@ from ..utils import 
      RegexNotFoundError,
      sanitize_filename,
      unescapeHTML,
+     url_basename,
  )
  
  
@@@ -636,7 -638,7 +638,7 @@@ class InfoExtractor(object)
      @staticmethod
      def _meta_regex(prop):
          return r'''(?isx)<meta
 -                    (?=[^>]+(?:itemprop|name|property)=(["\']?)%s\1)
 +                    (?=[^>]+(?:itemprop|name|property|id)=(["\']?)%s\1)
                      [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
  
      def _og_search_property(self, prop, html, name=None, **kargs):
          self._sort_formats(formats)
          return formats
  
-     # TODO: improve extraction
-     def _extract_smil_formats(self, smil_url, video_id, fatal=True):
-         smil = self._download_xml(
-             smil_url, video_id, 'Downloading SMIL file',
-             'Unable to download SMIL file', fatal=fatal)
+     @staticmethod
+     def _xpath_ns(path, namespace=None):
+         if not namespace:
+             return path
+         out = []
+         for c in path.split('/'):
+             if not c or c == '.':
+                 out.append(c)
+             else:
+                 out.append('{%s}%s' % (namespace, c))
+         return '/'.join(out)
+     def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None):
+         smil = self._download_smil(smil_url, video_id, fatal=fatal)
          if smil is False:
              assert not fatal
              return []
  
-         base = smil.find('./head/meta').get('base')
+         namespace = self._parse_smil_namespace(smil)
+         return self._parse_smil_formats(
+             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
+     def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
+         smil = self._download_smil(smil_url, video_id, fatal=fatal)
+         if smil is False:
+             return {}
+         return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
+     def _download_smil(self, smil_url, video_id, fatal=True):
+         return self._download_xml(
+             smil_url, video_id, 'Downloading SMIL file',
+             'Unable to download SMIL file', fatal=fatal)
+     def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
+         namespace = self._parse_smil_namespace(smil)
+         formats = self._parse_smil_formats(
+             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
+         subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
+         video_id = os.path.splitext(url_basename(smil_url))[0]
+         title = None
+         description = None
+         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
+             name = meta.attrib.get('name')
+             content = meta.attrib.get('content')
+             if not name or not content:
+                 continue
+             if not title and name == 'title':
+                 title = content
+             elif not description and name in ('description', 'abstract'):
+                 description = content
+         return {
+             'id': video_id,
+             'title': title or video_id,
+             'description': description,
+             'formats': formats,
+             'subtitles': subtitles,
+         }
+     def _parse_smil_namespace(self, smil):
+         return self._search_regex(
+             r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
+     def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None):
+         base = smil_url
+         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
+             b = meta.get('base') or meta.get('httpBase')
+             if b:
+                 base = b
+                 break
  
          formats = []
          rtmp_count = 0
-         if smil.findall('./body/seq/video'):
-             video = smil.findall('./body/seq/video')[0]
-             fmts, rtmp_count = self._parse_smil_video(video, video_id, base, rtmp_count)
-             formats.extend(fmts)
-         else:
-             for video in smil.findall('./body/switch/video'):
-                 fmts, rtmp_count = self._parse_smil_video(video, video_id, base, rtmp_count)
-                 formats.extend(fmts)
+         http_count = 0
+         videos = smil.findall(self._xpath_ns('.//video', namespace))
+         for video in videos:
+             src = video.get('src')
+             if not src:
+                 continue
+             bitrate = int_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000)
+             filesize = int_or_none(video.get('size') or video.get('fileSize'))
+             width = int_or_none(video.get('width'))
+             height = int_or_none(video.get('height'))
+             proto = video.get('proto')
+             ext = video.get('ext')
+             src_ext = determine_ext(src)
+             streamer = video.get('streamer') or base
+             if proto == 'rtmp' or streamer.startswith('rtmp'):
+                 rtmp_count += 1
+                 formats.append({
+                     'url': streamer,
+                     'play_path': src,
+                     'ext': 'flv',
+                     'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
+                     'tbr': bitrate,
+                     'filesize': filesize,
+                     'width': width,
+                     'height': height,
+                 })
+                 continue
+             src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
+             if proto == 'm3u8' or src_ext == 'm3u8':
+                 formats.extend(self._extract_m3u8_formats(
+                     src_url, video_id, ext or 'mp4', m3u8_id='hls'))
+                 continue
+             if src_ext == 'f4m':
+                 f4m_url = src_url
+                 if not f4m_params:
+                     f4m_params = {
+                         'hdcore': '3.2.0',
+                         'plugin': 'flowplayer-3.2.0.1',
+                     }
+                 f4m_url += '&' if '?' in f4m_url else '?'
+                 f4m_url += compat_urllib_parse.urlencode(f4m_params)
+                 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds'))
+                 continue
+             if src_url.startswith('http'):
+                 http_count += 1
+                 formats.append({
+                     'url': src_url,
+                     'ext': ext or src_ext or 'flv',
+                     'format_id': 'http-%d' % (bitrate or http_count),
+                     'tbr': bitrate,
+                     'filesize': filesize,
+                     'width': width,
+                     'height': height,
+                 })
+                 continue
  
          self._sort_formats(formats)
  
          return formats
  
-     def _parse_smil_video(self, video, video_id, base, rtmp_count):
-         src = video.get('src')
-         if not src:
-             return [], rtmp_count
-         bitrate = int_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000)
-         width = int_or_none(video.get('width'))
-         height = int_or_none(video.get('height'))
-         proto = video.get('proto')
-         if not proto:
-             if base:
-                 if base.startswith('rtmp'):
-                     proto = 'rtmp'
-                 elif base.startswith('http'):
-                     proto = 'http'
-         ext = video.get('ext')
-         if proto == 'm3u8':
-             return self._extract_m3u8_formats(src, video_id, ext), rtmp_count
-         elif proto == 'rtmp':
-             rtmp_count += 1
-             streamer = video.get('streamer') or base
-             return ([{
-                 'url': streamer,
-                 'play_path': src,
-                 'ext': 'flv',
-                 'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
-                 'tbr': bitrate,
-                 'width': width,
-                 'height': height,
-             }], rtmp_count)
-         elif proto.startswith('http'):
-             return ([{
-                 'url': base + src,
-                 'ext': ext or 'flv',
-                 'tbr': bitrate,
-                 'width': width,
-                 'height': height,
-             }], rtmp_count)
+     def _parse_smil_subtitles(self, smil, namespace=None):
+         subtitles = {}
+         for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
+             src = textstream.get('src')
+             if not src:
+                 continue
+             ext = textstream.get('ext') or determine_ext(src)
+             if not ext:
+                 type_ = textstream.get('type')
+                 if type_ == 'text/srt':
+                     ext = 'srt'
+             lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName')
+             subtitles.setdefault(lang, []).append({
+                 'url': src,
+                 'ext': ext,
+             })
+         return subtitles
  
      def _live_title(self, name):
          """ Generate the title for a live video """
index 469909a51f4dc899b007fecfac877229dab485b4,27584c44cf122d2f4ac41463c5311184bc2a0268..901f77304103af4aa28c3cb1104ff23a15da59cc
@@@ -130,6 -130,74 +130,74 @@@ class GenericIE(InfoExtractor)
                  'title': 'pdv_maddow_netcast_m4v-02-27-2015-201624',
              }
          },
+         # SMIL from http://videolectures.net/promogram_igor_mekjavic_eng
+         {
+             'url': 'http://videolectures.net/promogram_igor_mekjavic_eng/video/1/smil.xml',
+             'info_dict': {
+                 'id': 'smil',
+                 'ext': 'mp4',
+                 'title': 'Automatics, robotics and biocybernetics',
+                 'description': 'md5:815fc1deb6b3a2bff99de2d5325be482',
+                 'formats': 'mincount:16',
+                 'subtitles': 'mincount:1',
+             },
+             'params': {
+                 'force_generic_extractor': True,
+                 'skip_download': True,
+             },
+         },
+         # SMIL from http://www1.wdr.de/mediathek/video/livestream/index.html
+         {
+             'url': 'http://metafilegenerator.de/WDR/WDR_FS/hds/hds.smil',
+             'info_dict': {
+                 'id': 'hds',
+                 'ext': 'flv',
+                 'title': 'hds',
+                 'formats': 'mincount:1',
+             },
+             'params': {
+                 'skip_download': True,
+             },
+         },
+         # SMIL from https://www.restudy.dk/video/play/id/1637
+         {
+             'url': 'https://www.restudy.dk/awsmedia/SmilDirectory/video_1637.xml',
+             'info_dict': {
+                 'id': 'video_1637',
+                 'ext': 'flv',
+                 'title': 'video_1637',
+                 'formats': 'mincount:3',
+             },
+             'params': {
+                 'skip_download': True,
+             },
+         },
+         # SMIL from http://adventure.howstuffworks.com/5266-cool-jobs-iditarod-musher-video.htm
+         {
+             'url': 'http://services.media.howstuffworks.com/videos/450221/smil-service.smil',
+             'info_dict': {
+                 'id': 'smil-service',
+                 'ext': 'flv',
+                 'title': 'smil-service',
+                 'formats': 'mincount:1',
+             },
+             'params': {
+                 'skip_download': True,
+             },
+         },
+         # SMIL from http://new.livestream.com/CoheedandCambria/WebsterHall/videos/4719370
+         {
+             'url': 'http://api.new.livestream.com/accounts/1570303/events/1585861/videos/4719370.smil',
+             'info_dict': {
+                 'id': '4719370',
+                 'ext': 'mp4',
+                 'title': '571de1fd-47bc-48db-abf9-238872a58d1f',
+                 'formats': 'mincount:3',
+             },
+             'params': {
+                 'skip_download': True,
+             },
+         },
          # google redirect
          {
              'url': 'http://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=1&cad=rja&ved=0CCUQtwIwAA&url=http%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DcmQHVoWB5FY&ei=F-sNU-LLCaXk4QT52ICQBQ&usg=AFQjCNEw4hL29zgOohLXvpJ-Bdh2bils1Q&bvm=bv.61965928,d.bGE',
              },
              'add_ie': ['Ooyala'],
          },
 +        {
 +            # ooyala video embedded with http://player.ooyala.com/iframe.js
 +            'url': 'http://www.macrumors.com/2015/07/24/steve-jobs-the-man-in-the-machine-first-trailer/',
 +            'info_dict': {
 +                'id': 'p0MGJndjoG5SOKqO_hZJuZFPB-Tr5VgB',
 +                'ext': 'mp4',
 +                'title': '"Steve Jobs: Man in the Machine" trailer',
 +                'description': 'The first trailer for the Alex Gibney documentary "Steve Jobs: Man in the Machine."',
 +            },
 +            'params': {
 +                'skip_download': True,
 +            },
 +        },
          # multiple ooyala embeds on SBN network websites
          {
              'url': 'http://www.sbnation.com/college-football-recruiting/2015/2/3/7970291/national-signing-day-rationalizations-itll-be-ok-itll-be-ok',
  
          self.report_extraction(video_id)
  
-         # Is it an RSS feed?
+         # Is it an RSS feed or a SMIL file?
          try:
              doc = parse_xml(webpage)
              if doc.tag == 'rss':
                  return self._extract_rss(url, video_id, doc)
+             elif re.match(r'^(?:{[^}]+})?smil$', doc.tag):
+                 return self._parse_smil(doc, url, video_id)
          except compat_xml_parse_error:
              pass
  
              return self.url_result(mobj.group('url'))
  
          # Look for Ooyala videos
 -        mobj = (re.search(r'player\.ooyala\.com/[^"?]+\?[^"]*?(?:embedCode|ec)=(?P<ec>[^"&]+)', webpage) or
 +        mobj = (re.search(r'player\.ooyala\.com/[^"?]+[?#][^"]*?(?:embedCode|ec)=(?P<ec>[^"&]+)', webpage) or
                  re.search(r'OO\.Player\.create\([\'"].*?[\'"],\s*[\'"](?P<ec>.{32})[\'"]', webpage) or
                  re.search(r'SBN\.VideoLinkset\.ooyala\([\'"](?P<ec>.{32})[\'"]\)', webpage) or
                  re.search(r'data-ooyala-video-id\s*=\s*[\'"](?P<ec>.{32})[\'"]', webpage))
          if not found:
              # Broaden the findall a little bit: JWPlayer JS loader
              found = filter_video(re.findall(
 -                r'[^A-Za-z0-9]?file["\']?:\s*["\'](http(?![^\'"]+\.[0-9]+[\'"])[^\'"]+)["\']', webpage))
 +                r'[^A-Za-z0-9]?(?:file|video_url)["\']?:\s*["\'](http(?![^\'"]+\.[0-9]+[\'"])[^\'"]+)["\']', webpage))
          if not found:
              # Flow player
              found = filter_video(re.findall(r'''(?xs)