[amp] Add generic extractor for Akamai AMP feeds and use it in dramafever and foxnews...
author remitamine <remitamine@gmail.com>
Sat, 7 Nov 2015 15:54:35 +0000 (16:54 +0100)
committer remitamine <remitamine@gmail.com>
Sat, 7 Nov 2015 15:54:35 +0000 (16:54 +0100)
youtube_dl/extractor/amp.py [new file with mode: 0644]
youtube_dl/extractor/dramafever.py
youtube_dl/extractor/foxnews.py

diff --git a/youtube_dl/extractor/amp.py b/youtube_dl/extractor/amp.py
new file mode 100644 (file)
index 0000000..b573b92
--- /dev/null
@@ -0,0 +1,113 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    int_or_none,
+    parse_iso8601,
+)
+
+
+class AMPIE(InfoExtractor):
+    """Base extractor for sites that publish Akamai AMP JSON feeds."""
+
+    def _get_media_node(self, item, name, default=None):
+        """Look up a media field of a feed item.
+
+        Tries 'media-<name>' in the item's 'media-group' (or in the item
+        itself when there is no group), then 'media-<name>' on the item,
+        then plain '<name>' on the item; default is returned if all miss.
+        """
+        media_name = 'media-%s' % name
+        media_group = item.get('media-group') or item
+        return media_group.get(media_name) or item.get(media_name) or item.get(name, default)
+
+    def _extract_feed_info(self, url):
+        """Download an Akamai Adaptive Media Player feed and parse it.
+
+        Returns an info dict with id, title, description, thumbnails,
+        timestamp, duration, subtitles and formats built from the feed's
+        channel item.
+        """
+        item = self._download_json(
+            url, None,
+            'Downloading Akamai AMP feed',
+            'Unable to download Akamai AMP feed'
+        )['channel']['item']
+
+        video_id = item['guid']
+
+        thumbnails = []
+        media_thumbnail = self._get_media_node(item, 'thumbnail')
+        if media_thumbnail:
+            # A lone entry arrives as a dict rather than a list
+            if isinstance(media_thumbnail, dict):
+                media_thumbnail = [media_thumbnail]
+            for thumbnail_data in media_thumbnail:
+                thumbnail = thumbnail_data['@attributes']
+                thumbnails.append({
+                    'url': self._proto_relative_url(thumbnail['url'], 'http:'),
+                    'width': int_or_none(thumbnail.get('width')),
+                    'height': int_or_none(thumbnail.get('height')),
+                })
+
+        subtitles = {}
+        media_subtitle = self._get_media_node(item, 'subTitle')
+        if media_subtitle:
+            if isinstance(media_subtitle, dict):
+                media_subtitle = [media_subtitle]
+            for subtitle_data in media_subtitle:
+                subtitle = subtitle_data.get('@attributes', {})
+                href = subtitle.get('href')
+                if not href:
+                    continue
+                lang = subtitle.get('lang') or 'en'
+                subtitles[lang] = [{'url': href}]
+
+        formats = []
+        # Default to an empty list so an item without content does not
+        # raise TypeError when iterated below
+        media_content = self._get_media_node(item, 'content') or []
+        if isinstance(media_content, dict):
+            media_content = [media_content]
+        for media_data in media_content:
+            media = media_data['@attributes']
+            media_type = media['type']
+            if media_type == 'video/f4m':
+                f4m_formats = self._extract_f4m_formats(
+                    media['url'] + '?hdcore=3.4.0&plugin=aasp-3.4.0.132.124',
+                    video_id, f4m_id='hds', fatal=False)
+                if f4m_formats:
+                    formats.extend(f4m_formats)
+            elif media_type == 'application/x-mpegURL':
+                m3u8_formats = self._extract_m3u8_formats(
+                    media['url'], video_id, m3u8_id='hls', fatal=False)
+                if m3u8_formats:
+                    formats.extend(m3u8_formats)
+            else:
+                # The category label is optional; format_id is None without it
+                category = media_data.get('media-category') or {}
+                formats.append({
+                    'format_id': category.get('@attributes', {}).get('label'),
+                    'url': media['url'],
+                    'preference': 1,
+                    'vbr': int_or_none(media.get('bitrate')),
+                    'filesize': int_or_none(media.get('fileSize')),
+                })
+
+        self._sort_formats(formats)
+
+        # Duration is read from the first content entry, when there is one
+        first_media = media_content[0].get('@attributes', {}) if media_content else {}
+
+        return {
+            'id': video_id,
+            'title': self._get_media_node(item, 'title'),
+            'description': self._get_media_node(item, 'description'),
+            'thumbnails': thumbnails,
+            # Fall back to 'dc-date', the field FoxNewsIE parsed before using AMPIE
+            'timestamp': parse_iso8601(item.get('pubDate'), ' ') or parse_iso8601(item.get('dc-date')),
+            'duration': int_or_none(first_media.get('duration')),
+            'subtitles': subtitles,
+            'formats': formats,
+        }
index 38e6597c80f203b30a90a13c92027a4a5a305bd7..80a928827c4ef768eaa89bd2ac86e22823e9e43d 100644 (file)
@@ -3,7 +3,7 @@ from __future__ import unicode_literals
 
 import itertools
 
-from .common import InfoExtractor
+from .amp import AMPIE
 from ..compat import (
     compat_HTTPError,
     compat_urllib_parse,
@@ -19,7 +19,7 @@ from ..utils import (
 )
 
 
-class DramaFeverBaseIE(InfoExtractor):
+class DramaFeverBaseIE(AMPIE):
     _LOGIN_URL = 'https://www.dramafever.com/accounts/login/'
     _NETRC_MACHINE = 'dramafever'
 
@@ -80,60 +80,24 @@ class DramaFeverIE(DramaFeverBaseIE):
             'timestamp': 1404336058,
             'upload_date': '20140702',
             'duration': 343,
-        }
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
     }
 
     def _real_extract(self, url):
         video_id = self._match_id(url).replace('/', '.')
 
         try:
-            feed = self._download_json(
-                'http://www.dramafever.com/amp/episode/feed.json?guid=%s' % video_id,
-                video_id, 'Downloading episode JSON')['channel']['item']
+            info = self._extract_feed_info('http://www.dramafever.com/amp/episode/feed.json?guid=%s' % video_id)
         except ExtractorError as e:
             if isinstance(e.cause, compat_HTTPError):
                 raise ExtractorError(
                     'Currently unavailable in your country.', expected=True)
             raise
 
-        media_group = feed.get('media-group', {})
-
-        formats = []
-        for media_content in media_group['media-content']:
-            src = media_content.get('@attributes', {}).get('url')
-            if not src:
-                continue
-            ext = determine_ext(src)
-            if ext == 'f4m':
-                formats.extend(self._extract_f4m_formats(
-                    src, video_id, f4m_id='hds'))
-            elif ext == 'm3u8':
-                formats.extend(self._extract_m3u8_formats(
-                    src, video_id, 'mp4', m3u8_id='hls'))
-            else:
-                formats.append({
-                    'url': src,
-                })
-        self._sort_formats(formats)
-
-        title = media_group.get('media-title')
-        description = media_group.get('media-description')
-        duration = int_or_none(media_group['media-content'][0].get('@attributes', {}).get('duration'))
-        thumbnail = self._proto_relative_url(
-            media_group.get('media-thumbnail', {}).get('@attributes', {}).get('url'))
-        timestamp = parse_iso8601(feed.get('pubDate'), ' ')
-
-        subtitles = {}
-        for media_subtitle in media_group.get('media-subTitle', []):
-            lang = media_subtitle.get('@attributes', {}).get('lang')
-            href = media_subtitle.get('@attributes', {}).get('href')
-            if not lang or not href:
-                continue
-            subtitles[lang] = [{
-                'ext': 'ttml',
-                'url': href,
-            }]
-
         series_id, episode_number = video_id.split('.')
         episode_info = self._download_json(
             # We only need a single episode info, so restricting page size to one episode
@@ -146,21 +110,13 @@ class DramaFeverIE(DramaFeverBaseIE):
             if value:
                 subfile = value[0].get('subfile') or value[0].get('new_subfile')
                 if subfile and subfile != 'http://www.dramafever.com/st/':
-                    subtitles.setdefault('English', []).append({
+                    # The feed info may not carry a 'subtitles' dict yet
+                    info.setdefault('subtitles', {}).setdefault('English', []).append({
                         'ext': 'srt',
                         'url': subfile,
                     })
 
-        return {
-            'id': video_id,
-            'title': title,
-            'description': description,
-            'thumbnail': thumbnail,
-            'timestamp': timestamp,
-            'duration': duration,
-            'formats': formats,
-            'subtitles': subtitles,
-        }
+        return info
 
 
 class DramaFeverSeriesIE(DramaFeverBaseIE):
index 3a4a59135da5b8b813090fffaf3f7cb9477f3743..0cd0f9fa83a5a79c1ded752353b81b2880c838ab 100644 (file)
@@ -2,14 +2,14 @@ from __future__ import unicode_literals
 
 import re
 
-from .common import InfoExtractor
+from .amp import AMPIE
 from ..utils import (
     parse_iso8601,
     int_or_none,
 )
 
 
-class FoxNewsIE(InfoExtractor):
+class FoxNewsIE(AMPIE):
     IE_DESC = 'Fox News and Fox Business Video'
     _VALID_URL = r'https?://(?P<host>video\.fox(?:news|business)\.com)/v/(?:video-embed\.html\?video_id=)?(?P<id>\d+)'
     _TESTS = [
@@ -20,10 +20,10 @@ class FoxNewsIE(InfoExtractor):
                 'id': '3937480',
                 'ext': 'flv',
                 'title': 'Frozen in Time',
-                'description': 'Doctors baffled by 16-year-old girl that is the size of a toddler',
+                'description': '16-year-old girl is size of toddler',
                 'duration': 265,
-                'timestamp': 1304411491,
-                'upload_date': '20110503',
+                # 'timestamp': 1304411491,
+                # 'upload_date': '20110503',
                 'thumbnail': 're:^https?://.*\.jpg$',
             },
         },
@@ -34,10 +34,10 @@ class FoxNewsIE(InfoExtractor):
                 'id': '3922535568001',
                 'ext': 'mp4',
                 'title': "Rep. Luis Gutierrez on if Obama's immigration plan is legal",
-                'description': "Congressman discusses the president's executive action",
+                'description': "Congressman discusses president's plan",
                 'duration': 292,
-                'timestamp': 1417662047,
-                'upload_date': '20141204',
+                # 'timestamp': 1417662047,
+                # 'upload_date': '20141204',
                 'thumbnail': 're:^https?://.*\.jpg$',
             },
         },
@@ -56,48 +56,6 @@ class FoxNewsIE(InfoExtractor):
         video_id = mobj.group('id')
         host = mobj.group('host')
 
-        video = self._download_json(
-            'http://%s/v/feed/video/%s.js?template=fox' % (host, video_id), video_id)
-
-        item = video['channel']['item']
-        title = item['title']
-        description = item['description']
-        timestamp = parse_iso8601(item['dc-date'])
-
-        media_group = item['media-group']
-        duration = None
-        formats = []
-        for media in media_group['media-content']:
-            attributes = media['@attributes']
-            video_url = attributes['url']
-            if video_url.endswith('.f4m'):
-                formats.extend(self._extract_f4m_formats(video_url + '?hdcore=3.4.0&plugin=aasp-3.4.0.132.124', video_id))
-            elif video_url.endswith('.m3u8'):
-                formats.extend(self._extract_m3u8_formats(video_url, video_id, 'flv'))
-            elif not video_url.endswith('.smil'):
-                duration = int_or_none(attributes.get('duration'))
-                formats.append({
-                    'url': video_url,
-                    'format_id': media['media-category']['@attributes']['label'],
-                    'preference': 1,
-                    'vbr': int_or_none(attributes.get('bitrate')),
-                    'filesize': int_or_none(attributes.get('fileSize'))
-                })
-        self._sort_formats(formats)
-
-        media_thumbnail = media_group['media-thumbnail']['@attributes']
-        thumbnails = [{
-            'url': media_thumbnail['url'],
-            'width': int_or_none(media_thumbnail.get('width')),
-            'height': int_or_none(media_thumbnail.get('height')),
-        }] if media_thumbnail else []
-
-        return {
-            'id': video_id,
-            'title': title,
-            'description': description,
-            'duration': duration,
-            'timestamp': timestamp,
-            'formats': formats,
-            'thumbnails': thumbnails,
-        }
+        info = self._extract_feed_info('http://%s/v/feed/video/%s.js?template=fox' % (host, video_id))
+        info['id'] = video_id
+        return info