From: remitamine <remitamine@gmail.com>
Date: Sat, 6 Feb 2016 05:26:02 +0000 (+0100)
Subject: Merge pull request #8408 from remitamine/dash
X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=commitdiff_plain;h=66159b38aad38d55f84a358a0c2ed2add9a2946d;hp=255732f0d33268aeababb1b3ce37a1defb5bc965;p=youtube-dl

Merge pull request #8408 from remitamine/dash

Add generic support for mpd manifests(dash formats)
---

diff --git a/README.md b/README.md
index 7c582511f..79cd08df4 100644
--- a/README.md
+++ b/README.md
@@ -455,6 +455,8 @@ The `-o` option allows users to indicate a template for the output file names. T
  - `format_id`: The sequence will be replaced by the format code specified by `--format`.
  - `duration`: The sequence will be replaced by the length of the video in seconds.
 
+Note that some of the aforementioned sequences are not guaranteed to be present since they depend on the metadata obtained by particular extractor, such sequences will be replaced with `NA`.
+
 The current default template is `%(title)s-%(id)s.%(ext)s`.
 
 In some cases, you don't want special characters such as ä¸­, spaces, or &, such as when transferring the downloaded filename to a Windows system or the filename through an 8bit-unsafe channel. In these cases, add the `--restrict-filenames` flag to get a shorter title:
diff --git a/docs/supportedsites.md b/docs/supportedsites.md
index 61be9990d..ee34adf26 100644
--- a/docs/supportedsites.md
+++ b/docs/supportedsites.md
@@ -91,6 +91,7 @@
  - **Canvas**
  - **CBS**
  - **CBSNews**: CBS News
+ - **CBSNewsLiveVideo**: CBS News Live Videos
  - **CBSSports**
  - **CeskaTelevize**
  - **channel9**: Channel 9
diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py
index 73910eaec..88c63010e 100644
--- a/test/test_YoutubeDL.py
+++ b/test/test_YoutubeDL.py
@@ -248,6 +248,17 @@ class TestFormatSelection(unittest.TestCase):
 
         def format_info(f_id):
             info = YoutubeIE._formats[f_id].copy()
+
+            # XXX: In real cases InfoExtractor._parse_mpd() fills up 'acodec'
+            # and 'vcodec', while in tests such information is incomplete since
+            # commit a6c2c24479e5f4827ceb06f64d855329c0a6f593
+            # test_YoutubeDL.test_youtube_format_selection is broken without
+            # this fix
+            if 'acodec' in info and 'vcodec' not in info:
+                info['vcodec'] = 'none'
+            elif 'vcodec' in info and 'acodec' not in info:
+                info['acodec'] = 'none'
+
             info['format_id'] = f_id
             info['url'] = 'url:' + f_id
             return info
diff --git a/test/test_subtitles.py b/test/test_subtitles.py
index 9a695c4e8..27e763edd 100644
--- a/test/test_subtitles.py
+++ b/test/test_subtitles.py
@@ -65,16 +65,16 @@ class TestYoutubeSubtitles(BaseTestSubtitles):
         self.DL.params['allsubtitles'] = True
         subtitles = self.getSubtitles()
         self.assertEqual(len(subtitles.keys()), 13)
-        self.assertEqual(md5(subtitles['en']), '4cd9278a35ba2305f47354ee13472260')
-        self.assertEqual(md5(subtitles['it']), '164a51f16f260476a05b50fe4c2f161d')
-        for lang in ['it', 'fr', 'de']:
+        self.assertEqual(md5(subtitles['en']), '3cb210999d3e021bd6c7f0ea751eab06')
+        self.assertEqual(md5(subtitles['it']), '6d752b98c31f1cf8d597050c7a2cb4b5')
+        for lang in ['fr', 'de']:
             self.assertTrue(subtitles.get(lang) is not None, 'Subtitles for \'%s\' not extracted' % lang)
 
-    def test_youtube_subtitles_sbv_format(self):
+    def test_youtube_subtitles_ttml_format(self):
         self.DL.params['writesubtitles'] = True
-        self.DL.params['subtitlesformat'] = 'sbv'
+        self.DL.params['subtitlesformat'] = 'ttml'
         subtitles = self.getSubtitles()
-        self.assertEqual(md5(subtitles['en']), '13aeaa0c245a8bed9a451cb643e3ad8b')
+        self.assertEqual(md5(subtitles['en']), 'e306f8c42842f723447d9f63ad65df54')
 
     def test_youtube_subtitles_vtt_format(self):
         self.DL.params['writesubtitles'] = True
diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py
index e61a88de7..2fbc7f812 100644
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py
@@ -90,7 +90,10 @@ from .canalplus import CanalplusIE
 from .canalc2 import Canalc2IE
 from .canvas import CanvasIE
 from .cbs import CBSIE
-from .cbsnews import CBSNewsIE
+from .cbsnews import (
+    CBSNewsIE,
+    CBSNewsLiveVideoIE,
+)
 from .cbssports import CBSSportsIE
 from .ccc import CCCIE
 from .ceskatelevize import CeskaTelevizeIE
@@ -819,7 +822,11 @@ from .videomore import (
 )
 from .videopremium import VideoPremiumIE
 from .videott import VideoTtIE
-from .vidme import VidmeIE
+from .vidme import (
+    VidmeIE,
+    VidmeUserIE,
+    VidmeUserLikesIE,
+)
 from .vidzi import VidziIE
 from .vier import VierIE, VierVideosIE
 from .viewster import ViewsterIE
diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py
index b9e07f0ef..6ed855a57 100644
--- a/youtube_dl/extractor/arte.py
+++ b/youtube_dl/extractor/arte.py
@@ -13,6 +13,7 @@ from ..utils import (
     unified_strdate,
     get_element_by_attribute,
     int_or_none,
+    NO_DEFAULT,
     qualities,
 )
 
@@ -93,9 +94,18 @@ class ArteTVPlus7IE(InfoExtractor):
         json_url = self._html_search_regex(
             patterns, webpage, 'json vp url', default=None)
         if not json_url:
-            iframe_url = self._html_search_regex(
-                r'<iframe[^>]+src=(["\'])(?P<url>.+\bjson_url=.+?)\1',
-                webpage, 'iframe url', group='url')
+            def find_iframe_url(webpage, default=NO_DEFAULT):
+                return self._html_search_regex(
+                    r'<iframe[^>]+src=(["\'])(?P<url>.+\bjson_url=.+?)\1',
+                    webpage, 'iframe url', group='url', default=default)
+
+            iframe_url = find_iframe_url(webpage, None)
+            if not iframe_url:
+                embed_url = self._html_search_regex(
+                    r'arte_vp_url_oembed=\'([^\']+?)\'', webpage, 'embed url')
+                player = self._download_json(
+                    embed_url, video_id, 'Downloading player page')
+                iframe_url = find_iframe_url(player['html'])
             json_url = compat_parse_qs(
                 compat_urllib_parse_urlparse(iframe_url).query)['json_url'][0]
         return self._extract_from_json_url(json_url, video_id, lang)
diff --git a/youtube_dl/extractor/cbsnews.py b/youtube_dl/extractor/cbsnews.py
index cabf7e73b..8f864699f 100644
--- a/youtube_dl/extractor/cbsnews.py
+++ b/youtube_dl/extractor/cbsnews.py
@@ -1,15 +1,14 @@
 # encoding: utf-8
 from __future__ import unicode_literals
 
-import re
-import json
-
+from .common import InfoExtractor
 from .theplatform import ThePlatformIE
+from ..utils import parse_duration
 
 
 class CBSNewsIE(ThePlatformIE):
     IE_DESC = 'CBS News'
-    _VALID_URL = r'http://(?:www\.)?cbsnews\.com/(?:[^/]+/)+(?P<id>[\da-z_-]+)'
+    _VALID_URL = r'http://(?:www\.)?cbsnews\.com/(?:news|videos)/(?P<id>[\da-z_-]+)'
 
     _TESTS = [
         {
@@ -48,14 +47,13 @@ class CBSNewsIE(ThePlatformIE):
     ]
 
     def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        video_id = mobj.group('id')
+        video_id = self._match_id(url)
 
         webpage = self._download_webpage(url, video_id)
 
-        video_info = json.loads(self._html_search_regex(
+        video_info = self._parse_json(self._html_search_regex(
             r'(?:<ul class="media-list items" id="media-related-items"><li data-video-info|<div id="cbsNewsVideoPlayer" data-video-player-options)=\'({.+?})\'',
-            webpage, 'video JSON info'))
+            webpage, 'video JSON info'), video_id)
 
         item = video_info['item'] if 'item' in video_info else video_info
         title = item.get('articleTitle') or item.get('hed')
@@ -88,3 +86,41 @@ class CBSNewsIE(ThePlatformIE):
             'formats': formats,
             'subtitles': subtitles,
         }
+
+
+class CBSNewsLiveVideoIE(InfoExtractor):
+    IE_DESC = 'CBS News Live Videos'
+    _VALID_URL = r'http://(?:www\.)?cbsnews\.com/live/video/(?P<id>[\da-z_-]+)'
+
+    _TEST = {
+        'url': 'http://www.cbsnews.com/live/video/clinton-sanders-prepare-to-face-off-in-nh/',
+        'info_dict': {
+            'id': 'clinton-sanders-prepare-to-face-off-in-nh',
+            'ext': 'flv',
+            'title': 'Clinton, Sanders Prepare To Face Off In NH',
+            'duration': 334,
+        },
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
+        video_info = self._parse_json(self._html_search_regex(
+            r'data-story-obj=\'({.+?})\'', webpage, 'video JSON info'), video_id)['story']
+
+        hdcore_sign = 'hdcore=3.3.1'
+        f4m_formats = self._extract_f4m_formats(video_info['url'] + '&' + hdcore_sign, video_id)
+        if f4m_formats:
+            for entry in f4m_formats:
+                # URLs without the extra param induce an 404 error
+                entry.update({'extra_param_to_segment_url': hdcore_sign})
+
+        return {
+            'id': video_id,
+            'title': video_info['headline'],
+            'thumbnail': video_info.get('thumbnail_url_hd') or video_info.get('thumbnail_url_sd'),
+            'duration': parse_duration(video_info.get('segmentDur')),
+            'formats': f4m_formats,
+        }
diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py
index b18e734c4..c02fe201c 100644
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -1229,19 +1229,24 @@ class GenericIE(InfoExtractor):
 
         # Check for direct link to a video
         content_type = head_response.headers.get('Content-Type', '')
-        m = re.match(r'^(?P<type>audio|video|application(?=/ogg$))/(?P<format_id>.+)$', content_type)
+        m = re.match(r'^(?P<type>audio|video|application(?=/(?:ogg$|(?:vnd\.apple\.|x-)?mpegurl)))/(?P<format_id>.+)$', content_type)
         if m:
             upload_date = unified_strdate(
                 head_response.headers.get('Last-Modified'))
+            formats = []
+            if m.group('format_id').endswith('mpegurl'):
+                formats = self._extract_m3u8_formats(url, video_id, 'mp4')
+            else:
+                formats = [{
+                    'format_id': m.group('format_id'),
+                    'url': url,
+                    'vcodec': 'none' if m.group('type') == 'audio' else None
+                }]
             return {
                 'id': video_id,
                 'title': compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0]),
                 'direct': True,
-                'formats': [{
-                    'format_id': m.group('format_id'),
-                    'url': url,
-                    'vcodec': 'none' if m.group('type') == 'audio' else None
-                }],
+                'formats': formats,
                 'upload_date': upload_date,
             }
 
diff --git a/youtube_dl/extractor/kuwo.py b/youtube_dl/extractor/kuwo.py
index 0c8ed5d07..f641edef8 100644
--- a/youtube_dl/extractor/kuwo.py
+++ b/youtube_dl/extractor/kuwo.py
@@ -31,6 +31,10 @@ class KuwoBaseIE(InfoExtractor):
                 (file_format['ext'], file_format.get('br', ''), song_id),
                 song_id, note='Download %s url info' % file_format['format'],
             )
+
+            if song_url == 'IPDeny':
+                raise ExtractorError('This song is blocked in this region', expected=True)
+
             if song_url.startswith('http://') or song_url.startswith('https://'):
                 formats.append({
                     'url': song_url,
diff --git a/youtube_dl/extractor/spankbang.py b/youtube_dl/extractor/spankbang.py
index 3cfa671ed..50433d0f6 100644
--- a/youtube_dl/extractor/spankbang.py
+++ b/youtube_dl/extractor/spankbang.py
@@ -7,7 +7,7 @@ from .common import InfoExtractor
 
 class SpankBangIE(InfoExtractor):
     _VALID_URL = r'https?://(?:(?:www|[a-z]{2})\.)?spankbang\.com/(?P<id>[\da-z]+)/video'
-    _TEST = {
+    _TESTS = [{
         'url': 'http://spankbang.com/3vvn/video/fantasy+solo',
         'md5': '1cc433e1d6aa14bc376535b8679302f7',
         'info_dict': {
@@ -19,7 +19,11 @@ class SpankBangIE(InfoExtractor):
             'uploader': 'silly2587',
             'age_limit': 18,
         }
-    }
+    }, {
+        # 480p only
+        'url': 'http://spankbang.com/1vt0/video/solvane+gangbang',
+        'only_matching': True,
+    }]
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
@@ -34,7 +38,8 @@ class SpankBangIE(InfoExtractor):
             'ext': 'mp4',
             'format_id': '%sp' % height,
             'height': int(height),
-        } for height in re.findall(r'<(?:span|li)[^>]+q_(\d+)p', webpage)]
+        } for height in re.findall(r'<(?:span|li|p)[^>]+[qb]_(\d+)p', webpage)]
+        self._check_formats(formats, video_id)
         self._sort_formats(formats)
 
         title = self._html_search_regex(
diff --git a/youtube_dl/extractor/srgssr.py b/youtube_dl/extractor/srgssr.py
index 4707029ca..246970c4d 100644
--- a/youtube_dl/extractor/srgssr.py
+++ b/youtube_dl/extractor/srgssr.py
@@ -70,14 +70,11 @@ class SRGSSRIE(InfoExtractor):
                         asset_url, media_id, 'mp4', 'm3u8_native',
                         m3u8_id=format_id, fatal=False))
                 else:
-                    ext = None
-                    if protocol == 'RTMP':
-                        ext = self._search_regex(r'([a-z0-9]+):[^/]+', asset_url, 'ext')
                     formats.append({
                         'format_id': format_id,
                         'url': asset_url,
                         'preference': preference(quality),
-                        'ext': ext,
+                        'ext': 'flv' if protocol == 'RTMP' else None,
                     })
         self._sort_formats(formats)
 
diff --git a/youtube_dl/extractor/vidme.py b/youtube_dl/extractor/vidme.py
index 3d63ed4f0..b1156d531 100644
--- a/youtube_dl/extractor/vidme.py
+++ b/youtube_dl/extractor/vidme.py
@@ -1,5 +1,7 @@
 from __future__ import unicode_literals
 
+import itertools
+
 from .common import InfoExtractor
 from ..compat import compat_HTTPError
 from ..utils import (
@@ -11,7 +13,8 @@ from ..utils import (
 
 
 class VidmeIE(InfoExtractor):
-    _VALID_URL = r'https?://vid\.me/(?:e/)?(?P<id>[\da-zA-Z]+)'
+    IE_NAME = 'vidme'
+    _VALID_URL = r'https?://vid\.me/(?:e/)?(?P<id>[\da-zA-Z]{,5})(?:[^\da-zA-Z]|$)'
     _TESTS = [{
         'url': 'https://vid.me/QNB',
         'md5': 'f42d05e7149aeaec5c037b17e5d3dc82',
@@ -202,3 +205,69 @@ class VidmeIE(InfoExtractor):
             'comment_count': comment_count,
             'formats': formats,
         }
+
+
+class VidmeListBaseIE(InfoExtractor):
+    # Max possible limit according to https://docs.vid.me/#api-Videos-List
+    _LIMIT = 100
+
+    def _entries(self, user_id, user_name):
+        for page_num in itertools.count(1):
+            page = self._download_json(
+                'https://api.vid.me/videos/%s?user=%s&limit=%d&offset=%d'
+                % (self._API_ITEM, user_id, self._LIMIT, (page_num - 1) * self._LIMIT),
+                user_name, 'Downloading user %s page %d' % (self._API_ITEM, page_num))
+
+            videos = page.get('videos', [])
+            if not videos:
+                break
+
+            for video in videos:
+                video_url = video.get('full_url') or video.get('embed_url')
+                if video_url:
+                    yield self.url_result(video_url, VidmeIE.ie_key())
+
+            total = int_or_none(page.get('page', {}).get('total'))
+            if total and self._LIMIT * page_num >= total:
+                break
+
+    def _real_extract(self, url):
+        user_name = self._match_id(url)
+
+        user_id = self._download_json(
+            'https://api.vid.me/userByUsername?username=%s' % user_name,
+            user_name)['user']['user_id']
+
+        return self.playlist_result(
+            self._entries(user_id, user_name), user_id,
+            '%s - %s' % (user_name, self._TITLE))
+
+
+class VidmeUserIE(VidmeListBaseIE):
+    IE_NAME = 'vidme:user'
+    _VALID_URL = r'https?://vid\.me/(?:e/)?(?P<id>[\da-zA-Z]{6,})(?!/likes)(?:[^\da-zA-Z]|$)'
+    _API_ITEM = 'list'
+    _TITLE = 'Videos'
+    _TEST = {
+        'url': 'https://vid.me/EFARCHIVE',
+        'info_dict': {
+            'id': '3834632',
+            'title': 'EFARCHIVE - %s' % _TITLE,
+        },
+        'playlist_mincount': 238,
+    }
+
+
+class VidmeUserLikesIE(VidmeListBaseIE):
+    IE_NAME = 'vidme:user:likes'
+    _VALID_URL = r'https?://vid\.me/(?:e/)?(?P<id>[\da-zA-Z]{6,})/likes'
+    _API_ITEM = 'likes'
+    _TITLE = 'Likes'
+    _TEST = {
+        'url': 'https://vid.me/ErinAlexis/likes',
+        'info_dict': {
+            'id': '6483530',
+            'title': 'ErinAlexis - %s' % _TITLE,
+        },
+        'playlist_mincount': 415,
+    }
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py
index 828f5d1f4..63abe5477 100644
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@ -369,6 +369,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
         # RTMP (unnamed)
         '_rtmp': {'protocol': 'rtmp'},
     }
+    _SUBTITLE_FORMATS = ('ttml', 'vtt')
 
     IE_NAME = 'youtube'
     _TESTS = [
@@ -918,7 +919,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
             if lang in sub_lang_list:
                 continue
             sub_formats = []
-            for ext in ['sbv', 'vtt', 'srt']:
+            for ext in self._SUBTITLE_FORMATS:
                 params = compat_urllib_parse.urlencode({
                     'lang': lang,
                     'v': video_id,
@@ -988,7 +989,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
             for lang_node in caption_list.findall('target'):
                 sub_lang = lang_node.attrib['lang_code']
                 sub_formats = []
-                for ext in ['sbv', 'vtt', 'srt']:
+                for ext in self._SUBTITLE_FORMATS:
                     params = compat_urllib_parse.urlencode({
                         'lang': original_lang,
                         'tlang': sub_lang,
diff --git a/youtube_dl/postprocessor/ffmpeg.py b/youtube_dl/postprocessor/ffmpeg.py
index 16a64802a..22d7ac65a 100644
--- a/youtube_dl/postprocessor/ffmpeg.py
+++ b/youtube_dl/postprocessor/ffmpeg.py
@@ -391,6 +391,10 @@ class FFmpegMetadataPP(FFmpegPostProcessor):
         for (name, value) in metadata.items():
             options.extend(['-metadata', '%s=%s' % (name, value)])
 
+        # https://github.com/rg3/youtube-dl/issues/8350
+        if info.get('protocol') == 'm3u8_native' or info.get('protocol') == 'm3u8' and self._downloader.params.get('hls_prefer_native', False):
+            options.extend(['-bsf:a', 'aac_adtstoasc'])
+
         self._downloader.to_screen('[ffmpeg] Adding metadata to \'%s\'' % filename)
         self.run_ffmpeg(filename, temp_filename, options)
         os.remove(encodeFilename(filename))
diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py
index c63b61598..4262ad6ac 100644
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -2017,20 +2017,27 @@ def dfxp2srt(dfxp_data):
         'ttaf1': 'http://www.w3.org/2006/10/ttaf1',
     })
 
-    def parse_node(node):
-        str_or_empty = functools.partial(str_or_none, default='')
+    class TTMLPElementParser(object):
+        out = ''
 
-        out = str_or_empty(node.text)
+        def start(self, tag, attrib):
+            if tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):
+                self.out += '\n'
 
-        for child in node:
-            if child.tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):
-                out += '\n' + str_or_empty(child.tail)
-            elif child.tag in (_x('ttml:span'), _x('ttaf1:span'), 'span'):
-                out += str_or_empty(parse_node(child))
-            else:
-                out += str_or_empty(xml.etree.ElementTree.tostring(child))
+        def end(self, tag):
+            pass
 
-        return out
+        def data(self, data):
+            self.out += data
+
+        def close(self):
+            return self.out.strip()
+
+    def parse_node(node):
+        target = TTMLPElementParser()
+        parser = xml.etree.ElementTree.XMLParser(target=target)
+        parser.feed(xml.etree.ElementTree.tostring(node))
+        return parser.close()
 
     dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8'))
     out = []
diff --git a/youtube_dl/version.py b/youtube_dl/version.py
index 6da42c5a5..3fec14ab1 100644
--- a/youtube_dl/version.py
+++ b/youtube_dl/version.py
@@ -1,3 +1,3 @@
 from __future__ import unicode_literals
 
-__version__ = '2016.02.01'
+__version__ = '2016.02.05.1'