[limelight] Improve embeds extraction (closes #12761)
author Sergey M․ <dstftw@gmail.com>
Sun, 16 Apr 2017 17:23:16 +0000 (00:23 +0700)
committer Sergey M․ <dstftw@gmail.com>
Sun, 16 Apr 2017 17:23:16 +0000 (00:23 +0700)
* Move extraction code to extractor
* Add extraction for LimelightEmbeddedPlayerFlash embeds
* Extract multiple videos

youtube_dl/extractor/generic.py
youtube_dl/extractor/limelight.py

index 6a34c2491ebcbef18b3c7017a0567c433acaa842..c523abb25f6921f44d63c28a398156280b9bde76 100644 (file)
@@ -85,6 +85,7 @@ from .ustream import UstreamIE
 from .openload import OpenloadIE
 from .videopress import VideoPressIE
 from .rutube import RutubeIE
+from .limelight import LimelightBaseIE
 
 
 class GenericIE(InfoExtractor):
@@ -2483,6 +2484,11 @@ class GenericIE(InfoExtractor):
             return self.url_result(piksel_url, PikselIE.ie_key())
 
         # Look for Limelight embeds
+        limelight_urls = LimelightBaseIE._extract_urls(webpage, url)
+        if limelight_urls:
+            return self.playlist_result(
+                limelight_urls, video_id, video_title, video_description)
+
         mobj = re.search(r'LimelightPlayer\.doLoad(Media|Channel|ChannelList)\(["\'](?P<id>[a-z0-9]{32})', webpage)
         if mobj:
             lm = {
index f52c2e169554a7968c1535e9c67aa82430378dce..0041453af86f514f2bd15b00b4763ca129a3d164 100644 (file)
@@ -9,6 +9,7 @@ from ..utils import (
     determine_ext,
     float_or_none,
     int_or_none,
+    smuggle_url,
     unsmuggle_url,
     ExtractorError,
 )
@@ -18,6 +19,42 @@ class LimelightBaseIE(InfoExtractor):
     _PLAYLIST_SERVICE_URL = 'http://production-ps.lvp.llnw.net/r/PlaylistService/%s/%s/%s'
     _API_URL = 'http://api.video.limelight.com/rest/organizations/%s/%s/%s/%s.json'
 
+    @classmethod
+    def _extract_urls(cls, webpage, source_url):
+        """Return url_result entries (source_url smuggled in) for Limelight embeds in webpage."""
+        lm = {
+            'Media': 'media',
+            'Channel': 'channel',
+            'ChannelList': 'channel_list',
+        }
+        entries = []
+        for kind, video_id in re.findall(
+                r'LimelightPlayer\.doLoad(Media|Channel|ChannelList)\(["\'](?P<id>[a-z0-9]{32})',
+                webpage):
+            entries.append(cls.url_result(
+                smuggle_url(
+                    'limelight:%s:%s' % (lm[kind], video_id),
+                    {'source_url': source_url}),
+                'Limelight%s' % kind, video_id))
+        for mobj in re.finditer(
+                # As per [1] class attribute should be exactly equal to
+                # LimelightEmbeddedPlayerFlash but numerous examples seen
+                # that don't exactly match it (e.g. [2]).
+                # 1. http://support.3playmedia.com/hc/en-us/articles/227732408-Limelight-Embedding-the-Captions-Plugin-with-the-Limelight-Player-on-Your-Webpage
+                # 2. http://www.sedona.com/FacilitatorTraining2017
+                r'''(?sx)
+                    <object[^>]+class=(["\'])(?:(?!\1).)*\bLimelightEmbeddedPlayerFlash\b(?:(?!\1).)*\1[^>]*>.*?
+                        <param[^>]+
+                            name=(["\'])flashVars\2[^>]+
+                            value=(["\'])(?:(?!\3).)*mediaId=(?P<id>[a-z0-9]{32})
+                ''', webpage):
+            entries.append(cls.url_result(
+                smuggle_url(
+                    'limelight:media:%s' % mobj.group('id'),
+                    {'source_url': source_url}),
+                'LimelightMedia', mobj.group('id')))
+        return entries
+
     def _call_playlist_service(self, item_id, method, fatal=True, referer=None):
         headers = {}
         if referer: