]> git.bitcoin.ninja Git - youtube-dl/commitdiff
Merge pull request #8246 from dstftw/initial-json-ld-metadata-support
authorSergey M <dstftw@gmail.com>
Sat, 16 Jan 2016 01:20:15 +0000 (07:20 +0600)
committerSergey M <dstftw@gmail.com>
Sat, 16 Jan 2016 01:20:15 +0000 (07:20 +0600)
Initial JSON-LD metadata extraction support

docs/supportedsites.md
youtube_dl/YoutubeDL.py
youtube_dl/extractor/__init__.py
youtube_dl/extractor/cultureunplugged.py [new file with mode: 0644]
youtube_dl/extractor/cwtv.py [new file with mode: 0644]
youtube_dl/extractor/zippcast.py [new file with mode: 0644]
youtube_dl/version.py

index eb160bd2fafac9d80b297fd0089ef4d966f44885..99b1e27317da98573803cd6acbdbb5789ea05eb5 100644 (file)
@@ -24,6 +24,7 @@
  - **AdobeTVShow**
  - **AdobeTVVideo**
  - **AdultSwim**
+ - **AE**
  - **Aftonbladet**
  - **AirMozilla**
  - **AlJazeera**
@@ -85,6 +86,7 @@
  - **CamdemyFolder**
  - **canalc2.tv**
  - **Canalplus**: canalplus.fr, piwiplus.fr and d8.tv
+ - **Canvas**
  - **CBS**
  - **CBSNews**: CBS News
  - **CBSSports**
  - **CSpan**: C-SPAN
  - **CtsNews**: 華視新聞
  - **culturebox.francetvinfo.fr**
+ - **CWTV**
  - **dailymotion**
  - **dailymotion:playlist**
  - **dailymotion:user**
  - **Helsinki**: helsinki.fi
  - **HentaiStigma**
  - **HistoricFilms**
- - **History**
  - **hitbox**
  - **hitbox:live**
  - **HornBunny**
index 6b73b8e06e0b53e5fb7764deaa349d8370f313dd..0748fbba0262cb29b90b73bba749d802f2b3bcc1 100755 (executable)
@@ -1244,6 +1244,12 @@ class YoutubeDL(object):
             except (ValueError, OverflowError, OSError):
                 pass
 
+        # Auto generate title fields corresponding to the *_number fields when missing
+        # in order to always have clean titles. This is very common for TV series.
+        for field in ('chapter', 'season', 'episode'):
+            if info_dict.get('%s_number' % field) is not None and not info_dict.get(field):
+                info_dict[field] = '%s %d' % (field.capitalize(), info_dict['%s_number' % field])
+
         subtitles = info_dict.get('subtitles')
         if subtitles:
             for _, subtitle in subtitles.items():
index 3975354da0d57f79c4209cf2ca35a3373e8fa07f..2c56797a50a833bb6783c946e2aa776cf9c87e07 100644 (file)
@@ -130,6 +130,8 @@ from .crunchyroll import (
 )
 from .cspan import CSpanIE
 from .ctsnews import CtsNewsIE
+from .cultureunplugged import CultureUnpluggedIE
+from .cwtv import CWTVIE
 from .dailymotion import (
     DailymotionIE,
     DailymotionPlaylistIE,
@@ -914,6 +916,7 @@ from .zingmp3 import (
     ZingMp3SongIE,
     ZingMp3AlbumIE,
 )
+from .zippcast import ZippCastIE
 
 _ALL_CLASSES = [
     klass
diff --git a/youtube_dl/extractor/cultureunplugged.py b/youtube_dl/extractor/cultureunplugged.py
new file mode 100644 (file)
index 0000000..9c764fe
--- /dev/null
@@ -0,0 +1,63 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import int_or_none
+
+
+class CultureUnpluggedIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?cultureunplugged\.com/documentary/watch-online/play/(?P<id>\d+)(?:/(?P<display_id>[^/]+))?'
+    _TESTS = [{
+        'url': 'http://www.cultureunplugged.com/documentary/watch-online/play/53662/The-Next--Best-West',
+        'md5': 'ac6c093b089f7d05e79934dcb3d228fc',
+        'info_dict': {
+            'id': '53662',
+            'display_id': 'The-Next--Best-West',
+            'ext': 'mp4',
+            'title': 'The Next, Best West',
+            'description': 'md5:0423cd00833dea1519cf014e9d0903b1',
+            'thumbnail': 're:^https?://.*\.jpg$',
+            'creator': 'Coldstream Creative',
+            'duration': 2203,
+            'view_count': int,
+        }
+    }, {
+        'url': 'http://www.cultureunplugged.com/documentary/watch-online/play/53662',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+        display_id = mobj.group('display_id') or video_id
+
+        movie_data = self._download_json(
+            'http://www.cultureunplugged.com/movie-data/cu-%s.json' % video_id, display_id)
+
+        video_url = movie_data['url']
+        title = movie_data['title']
+
+        description = movie_data.get('synopsis')
+        creator = movie_data.get('producer')
+        duration = int_or_none(movie_data.get('duration'))
+        view_count = int_or_none(movie_data.get('views'))
+
+        thumbnails = [{
+            'url': movie_data['%s_thumb' % size],
+            'id': size,
+            'preference': preference,
+        } for preference, size in enumerate((
+            'small', 'large')) if movie_data.get('%s_thumb' % size)]
+
+        return {
+            'id': video_id,
+            'display_id': display_id,
+            'url': video_url,
+            'title': title,
+            'description': description,
+            'creator': creator,
+            'duration': duration,
+            'view_count': view_count,
+            'thumbnails': thumbnails,
+        }
diff --git a/youtube_dl/extractor/cwtv.py b/youtube_dl/extractor/cwtv.py
new file mode 100644 (file)
index 0000000..36af670
--- /dev/null
@@ -0,0 +1,88 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    int_or_none,
+    parse_iso8601,
+)
+
+
+class CWTVIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?cw(?:tv|seed)\.com/shows/(?:[^/]+/){2}\?play=(?P<id>[a-z0-9]{8}-[a-z0-9]{4}-[a-z0-9]{4}-[a-z0-9]{4}-[a-z0-9]{12})'
+    _TESTS = [{
+        'url': 'http://cwtv.com/shows/arrow/legends-of-yesterday/?play=6b15e985-9345-4f60-baf8-56e96be57c63',
+        'info_dict': {
+            'id': '6b15e985-9345-4f60-baf8-56e96be57c63',
+            'ext': 'mp4',
+            'title': 'Legends of Yesterday',
+            'description': 'Oliver and Barry Allen take Kendra Saunders and Carter Hall to a remote location to keep them hidden from Vandal Savage while they figure out how to defeat him.',
+            'duration': 2665,
+            'series': 'Arrow',
+            'season_number': 4,
+            'season': '4',
+            'episode_number': 8,
+            'upload_date': '20151203',
+            'timestamp': 1449122100,
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        }
+    }, {
+        'url': 'http://www.cwseed.com/shows/whose-line-is-it-anyway/jeff-davis-4/?play=24282b12-ead2-42f2-95ad-26770c2c6088',
+        'info_dict': {
+            'id': '24282b12-ead2-42f2-95ad-26770c2c6088',
+            'ext': 'mp4',
+            'title': 'Jeff Davis 4',
+            'description': 'Jeff Davis is back to make you laugh.',
+            'duration': 1263,
+            'series': 'Whose Line Is It Anyway?',
+            'season_number': 11,
+            'season': '11',
+            'episode_number': 20,
+            'upload_date': '20151006',
+            'timestamp': 1444107300,
+        },
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        }
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        video_data = self._download_json(
+            'http://metaframe.digitalsmiths.tv/v2/CWtv/assets/%s/partner/132?format=json' % video_id, video_id)
+
+        formats = self._extract_m3u8_formats(
+            video_data['videos']['variantplaylist']['uri'], video_id, 'mp4')
+
+        thumbnails = [{
+            'url': image['uri'],
+            'width': image.get('width'),
+            'height': image.get('height'),
+        } for image_id, image in video_data['images'].items() if image.get('uri')] if video_data.get('images') else None
+
+        video_metadata = video_data['assetFields']
+
+        subtitles = {
+            'en': [{
+                'url': video_metadata['UnicornCcUrl'],
+            }],
+        } if video_metadata.get('UnicornCcUrl') else None
+
+        return {
+            'id': video_id,
+            'title': video_metadata['title'],
+            'description': video_metadata.get('description'),
+            'duration': int_or_none(video_metadata.get('duration')),
+            'series': video_metadata.get('seriesName'),
+            'season_number': int_or_none(video_metadata.get('seasonNumber')),
+            'season': video_metadata.get('seasonName'),
+            'episode_number': int_or_none(video_metadata.get('episodeNumber')),
+            'timestamp': parse_iso8601(video_data.get('startTime')),
+            'thumbnails': thumbnails,
+            'formats': formats,
+            'subtitles': subtitles,
+        }
diff --git a/youtube_dl/extractor/zippcast.py b/youtube_dl/extractor/zippcast.py
new file mode 100644 (file)
index 0000000..de81937
--- /dev/null
@@ -0,0 +1,94 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    determine_ext,
+    str_to_int,
+)
+
+
+class ZippCastIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?zippcast\.com/(?:video/|videoview\.php\?.*\bvplay=)(?P<id>[0-9a-zA-Z]+)'
+    _TESTS = [{
+        # m3u8, hq direct link
+        'url': 'http://www.zippcast.com/video/c9cfd5c7e44dbc29c81',
+        'md5': '5ea0263b5606866c4d6cda0fc5e8c6b6',
+        'info_dict': {
+            'id': 'c9cfd5c7e44dbc29c81',
+            'ext': 'mp4',
+            'title': '[Vinesauce] Vinny - Digital Space Traveler',
+            'description': 'Muted on youtube, but now uploaded in it\'s original form.',
+            'thumbnail': 're:^https?://.*\.jpg$',
+            'uploader': 'vinesauce',
+            'view_count': int,
+            'categories': ['Entertainment'],
+            'tags': list,
+        },
+    }, {
+        # f4m, lq ipod direct link
+        'url': 'http://www.zippcast.com/video/b79c0a233e9c6581775',
+        'only_matching': True,
+    }, {
+        'url': 'http://www.zippcast.com/videoview.php?vplay=c9cfd5c7e44dbc29c81&auto=no',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(
+            'http://www.zippcast.com/video/%s' % video_id, video_id)
+
+        formats = []
+        video_url = self._search_regex(
+            r'<source[^>]+src=(["\'])(?P<url>.+?)\1', webpage,
+            'video url', default=None, group='url')
+        if video_url:
+            formats.append({
+                'url': video_url,
+                'format_id': 'http',
+                'preference': 0,  # direct link is almost always of worse quality
+            })
+        src_url = self._search_regex(
+            r'src\s*:\s*(?:escape\()?(["\'])(?P<url>http://.+?)\1',
+            webpage, 'src', default=None, group='url')
+        ext = determine_ext(src_url)
+        if ext == 'm3u8':
+            formats.extend(self._extract_m3u8_formats(
+                src_url, video_id, 'mp4', entry_protocol='m3u8_native',
+                m3u8_id='hls', fatal=False))
+        elif ext == 'f4m':
+            formats.extend(self._extract_f4m_formats(
+                src_url, video_id, f4m_id='hds', fatal=False))
+        self._sort_formats(formats)
+
+        title = self._og_search_title(webpage)
+        description = self._og_search_description(webpage) or self._html_search_meta(
+            'description', webpage)
+        uploader = self._search_regex(
+            r'<a[^>]+href="https?://[^/]+/profile/[^>]+>([^<]+)</a>',
+            webpage, 'uploader', fatal=False)
+        thumbnail = self._og_search_thumbnail(webpage)
+        view_count = str_to_int(self._search_regex(
+            r'>([\d,.]+) views!', webpage, 'view count', fatal=False))
+
+        categories = re.findall(
+            r'<a[^>]+href="https?://[^/]+/categories/[^"]+">([^<]+),?<',
+            webpage)
+        tags = re.findall(
+            r'<a[^>]+href="https?://[^/]+/search/tags/[^"]+">([^<]+),?<',
+            webpage)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'uploader': uploader,
+            'view_count': view_count,
+            'categories': categories,
+            'tags': tags,
+            'formats': formats,
+        }
index 4d433b6678842a59b34563491e3c552c607b28b1..02c438f3a58cba68e519218fd0c1cc5b225d9c3a 100644 (file)
@@ -1,3 +1,3 @@
 from __future__ import unicode_literals
 
-__version__ = '2016.01.14'
+__version__ = '2016.01.15'