[sandia] fix info extraction
author Remita Amine <remitamine@gmail.com>
Tue, 5 Jul 2016 12:37:46 +0000 (13:37 +0100)
committer Remita Amine <remitamine@gmail.com>
Tue, 5 Jul 2016 12:37:46 +0000 (13:37 +0100)
youtube_dl/extractor/sandia.py

index 759898a492f43c67179409c563be42e864deae5f..9ab4d20a71b8451452db09bee32224be79ef1160 100644 (file)
@@ -27,7 +27,8 @@ class SandiaIE(InfoExtractor):
             'ext': 'mp4',
             'title': 'Xyce Software Training - Section 1',
             'description': 're:(?s)SAND Number: SAND 2013-7800.{200,}',
-            'upload_date': '20120904',
+            'upload_date': '20120409',
+            'timestamp': 1333983600,
             'duration': 7794,
         }
     }
@@ -35,81 +36,36 @@ class SandiaIE(InfoExtractor):
     def _real_extract(self, url):
         video_id = self._match_id(url)
 
-        req = sanitized_Request(url)
-        req.add_header('Cookie', 'MediasitePlayerCaps=ClientPlugins=4')
-        webpage = self._download_webpage(req, video_id)
+        presentation_data = self._download_json(
+            'http://digitalops.sandia.gov/Mediasite/PlayerService/PlayerService.svc/json/GetPlayerOptions',
+            video_id, data=json.dumps({
+                'getPlayerOptionsRequest': {
+                    'ResourceId': video_id,
+                    'QueryString': '',
+                }
+            }), headers={
+                'Content-Type': 'application/json; charset=utf-8',
+            })['d']['Presentation']
 
-        js_path = self._search_regex(
-            r'<script type="text/javascript" src="(/Mediasite/FileServer/Presentation/[^"]+)"',
-            webpage, 'JS code URL')
-        js_url = compat_urlparse.urljoin(url, js_path)
-
-        js_code = self._download_webpage(
-            js_url, video_id, note='Downloading player')
-
-        def extract_str(key, **args):
-            return self._search_regex(
-                r'Mediasite\.PlaybackManifest\.%s\s*=\s*(.+);\s*?\n' % re.escape(key),
-                js_code, key, **args)
-
-        def extract_data(key, **args):
-            data_json = extract_str(key, **args)
-            if data_json is None:
-                return data_json
-            return self._parse_json(
-                data_json, video_id, transform_source=js_to_json)
+        title = presentation_data['Title']
 
         formats = []
-        for i in itertools.count():
-            fd = extract_data('VideoUrls[%d]' % i, default=None)
-            if fd is None:
-                break
-            formats.append({
-                'format_id': '%s' % i,
-                'format_note': fd['MimeType'].partition('/')[2],
-                'ext': mimetype2ext(fd['MimeType']),
-                'url': fd['Location'],
-                'protocol': 'f4m' if fd['MimeType'] == 'video/x-mp4-fragmented' else None,
-            })
+        for stream in presentation_data.get('Streams', []):
+            for fd in stream.get('VideoUrls', []):
+                formats.append({
+                    'format_id': fd['MediaType'],
+                    'format_note': fd['MimeType'].partition('/')[2],
+                    'ext': mimetype2ext(fd['MimeType']),
+                    'url': fd['Location'],
+                    'protocol': 'f4m' if fd['MimeType'] == 'video/x-mp4-fragmented' else None,
+                })
         self._sort_formats(formats)
 
-        slide_baseurl = compat_urlparse.urljoin(
-            url, extract_data('SlideBaseUrl'))
-        slide_template = slide_baseurl + re.sub(
-            r'\{0:D?([0-9+])\}', r'%0\1d', extract_data('SlideImageFileNameTemplate'))
-        slides = []
-        last_slide_time = 0
-        for i in itertools.count(1):
-            sd = extract_str('Slides[%d]' % i, default=None)
-            if sd is None:
-                break
-            timestamp = int_or_none(self._search_regex(
-                r'^Mediasite\.PlaybackManifest\.CreateSlide\("[^"]*"\s*,\s*([0-9]+),',
-                sd, 'slide %s timestamp' % i, fatal=False))
-            slides.append({
-                'url': slide_template % i,
-                'duration': timestamp - last_slide_time,
-            })
-            last_slide_time = timestamp
-        formats.append({
-            'format_id': 'slides',
-            'protocol': 'slideshow',
-            'url': json.dumps(slides),
-            'preference': -10000,  # Downloader not yet written
-        })
-        self._sort_formats(formats)
-
-        title = extract_data('Title')
-        description = extract_data('Description', fatal=False)
-        duration = int_or_none(extract_data(
-            'Duration', fatal=False), scale=1000)
-        upload_date = unified_strdate(extract_data('AirDate', fatal=False))
-
         return {
             'id': video_id,
             'title': title,
-            'description': description,
+            'description': presentation_data.get('Description'),
             'formats': formats,
-            'upload_date': upload_date,
-            'duration': duration,
+            'timestamp': int_or_none(presentation_data.get('UnixTime'), 1000),
+            'duration': int_or_none(presentation_data.get('Duration'), 1000),
         }