Merge branch 'charlierose' of https://github.com/TRox1972/youtube-dl into TRox1972...
author Yen Chi Hsuan <yan12125@gmail.com>
Sun, 21 Aug 2016 16:48:35 +0000 (00:48 +0800)
committer Yen Chi Hsuan <yan12125@gmail.com>
Sun, 21 Aug 2016 16:48:35 +0000 (00:48 +0800)
README.md
youtube_dl/YoutubeDL.py
youtube_dl/extractor/cbs.py
youtube_dl/extractor/cbsnews.py
youtube_dl/extractor/cbssports.py
youtube_dl/extractor/cnn.py
youtube_dl/extractor/cultureunplugged.py
youtube_dl/extractor/dotsub.py
youtube_dl/extractor/imdb.py
youtube_dl/extractor/kaltura.py

index 952db7abb8aff25473544d9827ef7b9b3060cc8d..a10aaf35c6733ae308c2309539110304ba25049f 100644 (file)
--- a/README.md
+++ b/README.md
@@ -645,7 +645,11 @@ $ youtube-dl -f 'best[filesize<50M]'
 
 # Download best format available via direct link over HTTP/HTTPS protocol
 $ youtube-dl -f '(bestvideo+bestaudio/best)[protocol^=http]'
+
+# Download the best video format and the best audio format without merging them
+$ youtube-dl -f 'bestvideo,bestaudio' -o '%(title)s.f%(format_id)s.%(ext)s'
 ```
+Note that in the last example, an output template is recommended, as bestvideo and bestaudio may otherwise have the same file name.
 
 
 # VIDEO SELECTION
index e844dc98a5b3915070ffae079395233de7ed04f7..0b3e3da823c13351abce111be2dd8ffa105212dc 100755 (executable)
@@ -1299,7 +1299,7 @@ class YoutubeDL(object):
                 for subtitle_format in subtitle:
                     if subtitle_format.get('url'):
                         subtitle_format['url'] = sanitize_url(subtitle_format['url'])
-                    if 'ext' not in subtitle_format:
+                    if subtitle_format.get('ext') is None:
                         subtitle_format['ext'] = determine_ext(subtitle_format['url']).lower()
 
         if self.params.get('listsubtitles', False):
@@ -1354,7 +1354,7 @@ class YoutubeDL(object):
                     note=' ({0})'.format(format['format_note']) if format.get('format_note') is not None else '',
                 )
             # Automatically determine file extension if missing
-            if 'ext' not in format:
+            if format.get('ext') is None:
                 format['ext'] = determine_ext(format['url']).lower()
             # Automatically determine protocol if missing (useful for format
             # selection purposes)
index a23173d6f1a9570225242692ee74d68fc061fb3d..c72ed2dbb7d2f6c97d1cca1e8652451db91e49ee 100644 (file)
@@ -4,6 +4,7 @@ from .theplatform import ThePlatformFeedIE
 from ..utils import (
     int_or_none,
     find_xpath_attr,
+    ExtractorError,
 )
 
 
@@ -17,19 +18,6 @@ class CBSBaseIE(ThePlatformFeedIE):
             }]
         } if closed_caption_e is not None and closed_caption_e.attrib.get('value') else []
 
-    def _extract_video_info(self, filter_query, video_id):
-        return self._extract_feed_info(
-            'dJ5BDC', 'VxxJg8Ymh8sE', filter_query, video_id, lambda entry: {
-                'series': entry.get('cbs$SeriesTitle'),
-                'season_number': int_or_none(entry.get('cbs$SeasonNumber')),
-                'episode': entry.get('cbs$EpisodeTitle'),
-                'episode_number': int_or_none(entry.get('cbs$EpisodeNumber')),
-            }, {
-                'StreamPack': {
-                    'manifest': 'm3u',
-                }
-            })
-
 
 class CBSIE(CBSBaseIE):
     _VALID_URL = r'(?:cbs:|https?://(?:www\.)?(?:cbs\.com/shows/[^/]+/video|colbertlateshow\.com/(?:video|podcasts))/)(?P<id>[\w-]+)'
@@ -38,7 +26,6 @@ class CBSIE(CBSBaseIE):
         'url': 'http://www.cbs.com/shows/garth-brooks/video/_u7W953k6la293J7EPTd9oHkSPs6Xn6_/connect-chat-feat-garth-brooks/',
         'info_dict': {
             'id': '_u7W953k6la293J7EPTd9oHkSPs6Xn6_',
-            'display_id': 'connect-chat-feat-garth-brooks',
             'ext': 'mp4',
             'title': 'Connect Chat feat. Garth Brooks',
             'description': 'Connect with country music singer Garth Brooks, as he chats with fans on Wednesday November 27, 2013. Be sure to tune in to Garth Brooks: Live from Las Vegas, Friday November 29, at 9/8c on CBS!',
@@ -47,7 +34,10 @@ class CBSIE(CBSBaseIE):
             'upload_date': '20131127',
             'uploader': 'CBSI-NEW',
         },
-        'expected_warnings': ['Failed to download m3u8 information'],
+        'params': {
+            # m3u8 download
+            'skip_download': True,
+        },
         '_skip': 'Blocked outside the US',
     }, {
         'url': 'http://colbertlateshow.com/video/8GmB0oY0McANFvp2aEffk9jZZZ2YyXxy/the-colbeard/',
@@ -56,8 +46,31 @@ class CBSIE(CBSBaseIE):
         'url': 'http://www.colbertlateshow.com/podcasts/dYSwjqPs_X1tvbV_P2FcPWRa_qT6akTC/in-the-bad-room-with-stephen/',
         'only_matching': True,
     }]
-    TP_RELEASE_URL_TEMPLATE = 'http://link.theplatform.com/s/dJ5BDC/%s?mbr=true'
+
+    def _extract_video_info(self, guid):
+        path = 'dJ5BDC/media/guid/2198311517/' + guid
+        smil_url = 'http://link.theplatform.com/s/%s?mbr=true' % path
+        formats, subtitles = self._extract_theplatform_smil(smil_url + '&manifest=m3u', guid)
+        for r in ('HLS&formats=M3U', 'RTMP', 'WIFI', '3G'):
+            try:
+                tp_formats, _ = self._extract_theplatform_smil(smil_url + '&assetTypes=' + r, guid, 'Downloading %s SMIL data' % r.split('&')[0])
+                formats.extend(tp_formats)
+            except ExtractorError:
+                continue
+        self._sort_formats(formats)
+        metadata = self._download_theplatform_metadata(path, guid)
+        info = self._parse_theplatform_metadata(metadata)
+        info.update({
+            'id': guid,
+            'formats': formats,
+            'subtitles': subtitles,
+            'series': metadata.get('cbs$SeriesTitle'),
+            'season_number': int_or_none(metadata.get('cbs$SeasonNumber')),
+            'episode': metadata.get('cbs$EpisodeTitle'),
+            'episode_number': int_or_none(metadata.get('cbs$EpisodeNumber')),
+        })
+        return info
 
     def _real_extract(self, url):
         content_id = self._match_id(url)
-        return self._extract_video_info('byGuid=%s' % content_id, content_id)
+        return self._extract_video_info(content_id)
index 9d3b75526395ee946ef660c14e0629019e10c1e2..4aa6917a0b494df6465d810b237040f4a7f30c78 100644 (file)
@@ -2,13 +2,13 @@
 from __future__ import unicode_literals
 
 from .common import InfoExtractor
-from .cbs import CBSBaseIE
+from .cbs import CBSIE
 from ..utils import (
     parse_duration,
 )
 
 
-class CBSNewsIE(CBSBaseIE):
+class CBSNewsIE(CBSIE):
     IE_DESC = 'CBS News'
     _VALID_URL = r'https?://(?:www\.)?cbsnews\.com/(?:news|videos)/(?P<id>[\da-z_-]+)'
 
@@ -35,7 +35,8 @@ class CBSNewsIE(CBSBaseIE):
                 'ext': 'mp4',
                 'title': 'Fort Hood shooting: Army downplays mental illness as cause of attack',
                 'description': 'md5:4a6983e480542d8b333a947bfc64ddc7',
-                'upload_date': '19700101',
+                'upload_date': '20140404',
+                'timestamp': 1396650660,
                 'uploader': 'CBSI-NEW',
                 'thumbnail': 're:^https?://.*\.jpg$',
                 'duration': 205,
@@ -63,7 +64,7 @@ class CBSNewsIE(CBSBaseIE):
 
         item = video_info['item'] if 'item' in video_info else video_info
         guid = item['mpxRefId']
-        return self._extract_video_info('byGuid=%s' % guid, guid)
+        return self._extract_video_info(guid)
 
 
 class CBSNewsLiveVideoIE(InfoExtractor):
index 78ca44b024bfb20dc6ce79e4ee51f3472d599711..bf7915626688787e9791dfd21c469f5bc5bed104 100644 (file)
@@ -23,6 +23,9 @@ class CBSSportsIE(CBSBaseIE):
         }
     }]
 
+    def _extract_video_info(self, filter_query, video_id):
+        return self._extract_feed_info('dJ5BDC', 'VxxJg8Ymh8sE', filter_query, video_id)
+
     def _real_extract(self, url):
         video_id = self._match_id(url)
         return self._extract_video_info('byId=%s' % video_id, video_id)
index 53489a14e38399680c8338f4f22a521f7fa6ad45..220bb55e8f4ea417481f3f207c4112d8c9e13e17 100644 (file)
@@ -11,7 +11,7 @@ from ..utils import (
 
 
 class CNNIE(InfoExtractor):
-    _VALID_URL = r'''(?x)https?://(?:(?:edition|www)\.)?cnn\.com/video/(?:data/.+?|\?)/
+    _VALID_URL = r'''(?x)https?://(?:(?P<sub_domain>edition|www|money)\.)?cnn\.com/(?:video/(?:data/.+?|\?)/)?videos?/
         (?P<path>.+?/(?P<title>[^/]+?)(?:\.(?:[a-z\-]+)|(?=&)))'''
 
     _TESTS = [{
@@ -45,19 +45,46 @@ class CNNIE(InfoExtractor):
             'description': 'md5:e7223a503315c9f150acac52e76de086',
             'upload_date': '20141222',
         }
+    }, {
+        'url': 'http://money.cnn.com/video/news/2016/08/19/netflix-stunning-stats.cnnmoney/index.html',
+        'md5': '52a515dc1b0f001cd82e4ceda32be9d1',
+        'info_dict': {
+            'id': '/video/news/2016/08/19/netflix-stunning-stats.cnnmoney',
+            'ext': 'mp4',
+            'title': '5 stunning stats about Netflix',
+            'description': 'Did you know that Netflix has more than 80 million members? Here are five facts about the online video distributor that you probably didn\'t know.',
+            'upload_date': '20160819',
+        }
     }, {
         'url': 'http://cnn.com/video/?/video/politics/2015/03/27/pkg-arizona-senator-church-attendance-mandatory.ktvk',
         'only_matching': True,
     }, {
         'url': 'http://cnn.com/video/?/video/us/2015/04/06/dnt-baker-refuses-anti-gay-order.wkmg',
         'only_matching': True,
+    }, {
+        'url': 'http://edition.cnn.com/videos/arts/2016/04/21/olympic-games-cultural-a-z-brazil.cnn',
+        'only_matching': True,
     }]
 
+    _CONFIG = {
+        # http://edition.cnn.com/.element/apps/cvp/3.0/cfg/spider/cnn/expansion/config.xml
+        'edition': {
+            'data_src': 'http://edition.cnn.com/video/data/3.0/video/%s/index.xml',
+            'media_src': 'http://pmd.cdn.turner.com/cnn/big',
+        },
+        # http://money.cnn.com/.element/apps/cvp2/cfg/config.xml
+        'money': {
+            'data_src': 'http://money.cnn.com/video/data/4.0/video/%s.xml',
+            'media_src': 'http://ht3.cdn.turner.com/money/big',
+        },
+    }
+
     def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        path = mobj.group('path')
-        page_title = mobj.group('title')
-        info_url = 'http://edition.cnn.com/video/data/3.0/%s/index.xml' % path
+        sub_domain, path, page_title = re.match(self._VALID_URL, url).groups()
+        if sub_domain not in ('money', 'edition'):
+            sub_domain = 'edition'
+        config = self._CONFIG[sub_domain]
+        info_url = config['data_src'] % path
         info = self._download_xml(info_url, page_title)
 
         formats = []
@@ -66,7 +93,7 @@ class CNNIE(InfoExtractor):
             (?:_(?P<bitrate>[0-9]+)k)?
         ''')
         for f in info.findall('files/file'):
-            video_url = 'http://ht.cdn.turner.com/cnn/big%s' % (f.text.strip())
+            video_url = config['media_src'] + f.text.strip()
             fdct = {
                 'format_id': f.attrib['bitrate'],
                 'url': video_url,
@@ -146,7 +173,7 @@ class CNNBlogsIE(InfoExtractor):
 
 
 class CNNArticleIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:(?:edition|www)\.)?cnn\.com/(?!video/)'
+    _VALID_URL = r'https?://(?:(?:edition|www)\.)?cnn\.com/(?!videos?/)'
     _TEST = {
         'url': 'http://www.cnn.com/2014/12/21/politics/obama-north-koreas-hack-not-war-but-cyber-vandalism/',
         'md5': '689034c2a3d9c6dc4aa72d65a81efd01',
index 9c764fe68c57314d8524b2705f8bae7c30520c26..9f26fa5878777d3302383646ad581056f429841a 100644 (file)
@@ -1,9 +1,13 @@
 from __future__ import unicode_literals
 
 import re
+import time
 
 from .common import InfoExtractor
-from ..utils import int_or_none
+from ..utils import (
+    int_or_none,
+    HEADRequest,
+)
 
 
 class CultureUnpluggedIE(InfoExtractor):
@@ -32,6 +36,9 @@ class CultureUnpluggedIE(InfoExtractor):
         video_id = mobj.group('id')
         display_id = mobj.group('display_id') or video_id
 
+        # request setClientTimezone.php to get the PHPSESSID cookie, which is needed to get valid json data in the next request
+        self._request_webpage(HEADRequest(
+            'http://www.cultureunplugged.com/setClientTimezone.php?timeOffset=%d' % -(time.timezone / 3600)), display_id)
         movie_data = self._download_json(
             'http://www.cultureunplugged.com/movie-data/cu-%s.json' % video_id, display_id)
 
index e9ca236d4a03c13b1b29b3386535c4262332dab0..fd64d1a7f1bb85e454657efe2656c1bb62a01f1c 100644 (file)
@@ -10,18 +10,18 @@ from ..utils import (
 class DotsubIE(InfoExtractor):
     _VALID_URL = r'https?://(?:www\.)?dotsub\.com/view/(?P<id>[^/]+)'
     _TEST = {
-        'url': 'http://dotsub.com/view/aed3b8b2-1889-4df5-ae63-ad85f5572f27',
-        'md5': '0914d4d69605090f623b7ac329fea66e',
+        'url': 'https://dotsub.com/view/9c63db2a-fa95-4838-8e6e-13deafe47f09',
+        'md5': '21c7ff600f545358134fea762a6d42b6',
         'info_dict': {
-            'id': 'aed3b8b2-1889-4df5-ae63-ad85f5572f27',
+            'id': '9c63db2a-fa95-4838-8e6e-13deafe47f09',
             'ext': 'flv',
-            'title': 'Pyramids of Waste (2010), AKA The Lightbulb Conspiracy - Planned obsolescence documentary',
-            'description': 'md5:699a0f7f50aeec6042cb3b1db2d0d074',
-            'thumbnail': 're:^https?://dotsub.com/media/aed3b8b2-1889-4df5-ae63-ad85f5572f27/p',
-            'duration': 3169,
-            'uploader': '4v4l0n42',
-            'timestamp': 1292248482.625,
-            'upload_date': '20101213',
+            'title': 'MOTIVATION - "It\'s Possible" Best Inspirational Video Ever',
+            'description': 'md5:41af1e273edbbdfe4e216a78b9d34ac6',
+            'thumbnail': 're:^https?://dotsub.com/media/9c63db2a-fa95-4838-8e6e-13deafe47f09/p',
+            'duration': 198,
+            'uploader': 'liuxt',
+            'timestamp': 1385778501.104,
+            'upload_date': '20131130',
             'view_count': int,
         }
     }
index 0acce9f4c2525a62e3d3ad22b16743737dbb5b07..3a6a6f5ad8bddbf4e4ebcc2dbc7974f2e71ef32e 100644 (file)
@@ -6,6 +6,7 @@ from .common import InfoExtractor
 from ..utils import (
     mimetype2ext,
     qualities,
+    remove_end,
 )
 
 
@@ -19,7 +20,7 @@ class ImdbIE(InfoExtractor):
         'info_dict': {
             'id': '2524815897',
             'ext': 'mp4',
-            'title': 'Ice Age: Continental Drift Trailer (No. 2) - IMDb',
+            'title': 'Ice Age: Continental Drift Trailer (No. 2)',
             'description': 'md5:9061c2219254e5d14e03c25c98e96a81',
         }
     }, {
@@ -83,10 +84,10 @@ class ImdbIE(InfoExtractor):
 
         return {
             'id': video_id,
-            'title': self._og_search_title(webpage),
+            'title': remove_end(self._og_search_title(webpage), ' - IMDb'),
             'formats': formats,
             'description': descr,
-            'thumbnail': format_info['slate'],
+            'thumbnail': format_info.get('slate'),
         }
 
 
index ddf1165ffb021005119622d3f162cbce9c637b55..e0f7366c2ec2eaf1cc507d850b46b49a80044d9d 100644 (file)
@@ -67,6 +67,27 @@ class KalturaIE(InfoExtractor):
             # video with subtitles
             'url': 'kaltura:111032:1_cw786r8q',
             'only_matching': True,
+        },
+        {
+            # video with ttml subtitles (no fileExt)
+            'url': 'kaltura:1926081:0_l5ye1133',
+            'info_dict': {
+                'id': '0_l5ye1133',
+                'ext': 'mp4',
+                'title': 'What Can You Do With Python?',
+                'upload_date': '20160221',
+                'uploader_id': 'stork',
+                'thumbnail': 're:^https?://.*/thumbnail/.*',
+                'timestamp': int,
+                'subtitles': {
+                    'en': [{
+                        'ext': 'ttml',
+                    }],
+                },
+            },
+            'params': {
+                'skip_download': True,
+            },
         }
     ]
 
@@ -122,18 +143,6 @@ class KalturaIE(InfoExtractor):
 
         return data
 
-    def _get_kaltura_signature(self, video_id, partner_id, service_url=None):
-        actions = [{
-            'apiVersion': '3.1',
-            'expiry': 86400,
-            'format': 1,
-            'service': 'session',
-            'action': 'startWidgetSession',
-            'widgetId': '_%s' % partner_id,
-        }]
-        return self._kaltura_api_call(
-            video_id, actions, service_url, note='Downloading Kaltura signature')['ks']
-
     def _get_video_info(self, video_id, partner_id, service_url=None):
         actions = [
             {
@@ -208,6 +217,17 @@ class KalturaIE(InfoExtractor):
                     reference_id)['entryResult']
                 info, flavor_assets = entry_data['meta'], entry_data['contextData']['flavorAssets']
                 entry_id = info['id']
+                # Unfortunately, data returned in kalturaIframePackageData lacks
+                # captions, so we will try requesting the complete data using the
+                # regular approach since we now know the entry_id
+                try:
+                    _, info, flavor_assets, captions = self._get_video_info(
+                        entry_id, partner_id)
+                except ExtractorError:
+                    # Regular scenario failed, but we already have everything
+                    # extracted apart from captions and can at least proceed
+                    # with this
+                    pass
             else:
                 raise ExtractorError('Invalid URL', expected=True)
             ks = params.get('flashvars[ks]', [None])[0]
@@ -267,7 +287,7 @@ class KalturaIE(InfoExtractor):
                     continue
                 subtitles.setdefault(caption.get('languageCode') or caption.get('language'), []).append({
                     'url': '%s/api_v3/service/caption_captionasset/action/serve/captionAssetId/%s' % (self._SERVICE_URL, caption['id']),
-                    'ext': caption.get('fileExt'),
+                    'ext': caption.get('fileExt', 'ttml'),
                 })
 
         return {