[generic] extract dash formats detected using content type

[youtube-dl] / youtube_dl / extractor / generic.py
diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py

index 7cf13fddfe37fa1bf2d2e6329c060f9d4c6286f7..4c2c76dc57d9ec52547e0601fc16d45bb4db7e08 100644 (file)
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -44,7 +44,6 @@ from .myvi import MyviIE
  from .condenast import CondeNastIE
  from .udn import UDNEmbedIE
  from .senateisvp import SenateISVPIE
-from .bliptv import BlipTVIE
  from .svt import SVTIE
  from .pornhub import PornHubIE
  from .xhamster import XHamsterEmbedIE
@@ -55,7 +54,10 @@ from .snagfilms import SnagFilmsEmbedIE
  from .screenwavemedia import ScreenwaveMediaIE
  from .mtv import MTVServicesEmbeddedIE
  from .pladform import PladformIE
+from .videomore import VideomoreIE
  from .googledrive import GoogleDriveIE
+from .jwplatform import JWPlatformIE
+from .digiteka import DigitekaIE
  
  
  class GenericIE(InfoExtractor):
@@ -485,7 +487,7 @@ class GenericIE(InfoExtractor):
                  'description': 'md5:8145d19d320ff3e52f28401f4c4283b9',
              }
          },
-        # Embeded Ustream video
+        # Embedded Ustream video
          {
              'url': 'http://www.american.edu/spa/pti/nsa-privacy-janus-2014.cfm',
              'md5': '27b99cdb639c9b12a79bca876a073417',
@@ -1227,19 +1229,26 @@ class GenericIE(InfoExtractor):
  
          # Check for direct link to a video
          content_type = head_response.headers.get('Content-Type', '')
-        m = re.match(r'^(?P<type>audio|video|application(?=/ogg$))/(?P<format_id>.+)$', content_type)
+        m = re.match(r'^(?P<type>audio|video|application(?=/(?:ogg$|dash\+xml|(?:vnd\.apple\.|x-)?mpegurl)))/(?P<format_id>.+)$', content_type)
          if m:
              upload_date = unified_strdate(
                  head_response.headers.get('Last-Modified'))
+            formats = []
+            if m.group('format_id').endswith('mpegurl'):
+                formats = self._extract_m3u8_formats(url, video_id, 'mp4')
+            elif m.group('format_id').startswith('dash+xml'):
+                formats = self._extract_mpd_formats(url, video_id)
+            else:
+                formats = [{
+                    'format_id': m.group('format_id'),
+                    'url': url,
+                    'vcodec': 'none' if m.group('type') == 'audio' else None
+                }]
              return {
                  'id': video_id,
                  'title': compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0]),
                  'direct': True,
-                'formats': [{
-                    'format_id': m.group('format_id'),
-                    'url': url,
-                    'vcodec': 'none' if m.group('type') == 'audio' else None
-                }],
+                'formats': formats,
                  'upload_date': upload_date,
              }
  
@@ -1400,7 +1409,7 @@ class GenericIE(InfoExtractor):
  
          # Look for embedded Dailymotion player
          matches = re.findall(
-            r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.com/embed/video/.+?)\1', webpage)
+            r'<(?:embed|iframe)[^>]+?src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.com/(?:embed|swf)/video/.+?)\1', webpage)
          if matches:
              return _playlist_from_matches(
                  matches, lambda m: unescapeHTML(m[1]))
@@ -1441,11 +1450,6 @@ class GenericIE(InfoExtractor):
                  'id': match.group('id')
              }
  
-        # Look for embedded blip.tv player
-        bliptv_url = BlipTVIE._extract_url(webpage)
-        if bliptv_url:
-            return self.url_result(bliptv_url, 'BlipTV')
-
          # Look for SVT player
          svt_url = SVTIE._extract_url(webpage)
          if svt_url:
@@ -1647,7 +1651,7 @@ class GenericIE(InfoExtractor):
          if myvi_url:
              return self.url_result(myvi_url)
  
-        # Look for embeded soundcloud player
+        # Look for embedded soundcloud player
          mobj = re.search(
              r'<iframe\s+(?:[a-zA-Z0-9_-]+="[^"]+"\s+)*src="(?P<url>https?://(?:w\.)?soundcloud\.com/player[^"]+)"',
              webpage)
@@ -1747,6 +1751,11 @@ class GenericIE(InfoExtractor):
          if pladform_url:
              return self.url_result(pladform_url)
  
+        # Look for Videomore embeds
+        videomore_url = VideomoreIE._extract_url(webpage)
+        if videomore_url:
+            return self.url_result(videomore_url)
+
          # Look for Playwire embeds
          mobj = re.search(
              r'<script[^>]+data-config=(["\'])(?P<url>(?:https?:)?//config\.playwire\.com/.+?)\1', webpage)
@@ -1802,11 +1811,32 @@ class GenericIE(InfoExtractor):
          if snagfilms_url:
              return self.url_result(snagfilms_url)
  
+        # Look for JWPlatform embeds
+        jwplatform_url = JWPlatformIE._extract_url(webpage)
+        if jwplatform_url:
+            return self.url_result(jwplatform_url, 'JWPlatform')
+
          # Look for ScreenwaveMedia embeds
          mobj = re.search(ScreenwaveMediaIE.EMBED_PATTERN, webpage)
          if mobj is not None:
              return self.url_result(unescapeHTML(mobj.group('url')), 'ScreenwaveMedia')
  
+        # Look for Digiteka embeds
+        digiteka_url = DigitekaIE._extract_url(webpage)
+        if digiteka_url:
+            return self.url_result(self._proto_relative_url(digiteka_url), DigitekaIE.ie_key())
+
+        # Look for Limelight embeds
+        mobj = re.search(r'LimelightPlayer\.doLoad(Media|Channel|ChannelList)\(["\'](?P<id>[a-z0-9]{32})', webpage)
+        if mobj:
+            lm = {
+                'Media': 'media',
+                'Channel': 'channel',
+                'ChannelList': 'channel_list',
+            }
+            return self.url_result('limelight:%s:%s' % (
+                lm[mobj.group(1)], mobj.group(2)), 'Limelight%s' % mobj.group(1), mobj.group(2))
+
          # Look for AdobeTVVideo embeds
          mobj = re.search(
              r'<iframe[^>]+src=[\'"]((?:https?:)?//video\.tv\.adobe\.com/v/\d+[^"]+)[\'"]',