Merge pull request #11122 from kasper93/openload
authorYen Chi Hsuan <yan12125@gmail.com>
Wed, 16 Nov 2016 12:43:19 +0000 (20:43 +0800)
committerGitHub <noreply@github.com>
Wed, 16 Nov 2016 12:43:19 +0000 (20:43 +0800)
[openload] Fix extraction.

12 files changed:
.github/ISSUE_TEMPLATE.md
ChangeLog
youtube_dl/downloader/f4m.py
youtube_dl/downloader/fragment.py
youtube_dl/downloader/hls.py
youtube_dl/extractor/afreecatv.py
youtube_dl/extractor/cda.py
youtube_dl/extractor/common.py
youtube_dl/extractor/nrk.py
youtube_dl/extractor/plays.py
youtube_dl/extractor/vlive.py
youtube_dl/version.py

index bfae97dddee163cfca367638be2a62a6c4236707..fef9fc7a269fb12375f75fb850b5aadbc9a39b17 100644 (file)
@@ -6,8 +6,8 @@
 
 ---
 
-### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2016.11.08.1*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected.
-- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2016.11.08.1**
+### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2016.11.14.1*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected.
+- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2016.11.14.1**
 
 ### Before submitting an *issue* make sure you have:
 - [ ] At least skimmed through [README](https://github.com/rg3/youtube-dl/blob/master/README.md) and **most notably** [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections
@@ -35,7 +35,7 @@ $ youtube-dl -v <your command line>
 [debug] User config: []
 [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj']
 [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251
-[debug] youtube-dl version 2016.11.08.1
+[debug] youtube-dl version 2016.11.14.1
 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2
 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4
 [debug] Proxy map: {}
index d97156e20a787c8cba8ef984a7f088cf1d6eeee9..577709c4428fa0dd6d0cbbb0bda983e5eac1bb86 100644 (file)
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,6 +1,18 @@
-version <unreleased>
+version 2016.11.14.1
 
-Extractors
+Core
++ [downloader/fragment,f4m,hls] Respect HTTP headers from info dict
+* [extractor/common] Fix media templates with Bandwidth substitution pattern in
+  MPD manifests (#11175)
+* [extractor/common] Improve thumbnail extraction from JSON-LD
+
+Extractors
++ [nrk] Workaround geo restriction
++ [nrk] Improve error detection and messages
++ [afreecatv] Add support for vod.afreecatv.com (#11174)
+* [cda] Fix and improve extraction (#10929, #10936)
+* [plays] Fix extraction (#11165)
+* [eagleplatform] Fix extraction (#11160)
 + [audioboom] Recognize /posts/ URLs (#11149)
 
 
index 80c21d40bc88382a64634b0eeb9daa3eaaccc303..688e086eb0536c55ef184ae68fa09a6ffb41462d 100644 (file)
@@ -314,7 +314,8 @@ class F4mFD(FragmentFD):
         man_url = info_dict['url']
         requested_bitrate = info_dict.get('tbr')
         self.to_screen('[%s] Downloading f4m manifest' % self.FD_NAME)
-        urlh = self.ydl.urlopen(man_url)
+
+        urlh = self.ydl.urlopen(self._prepare_url(info_dict, man_url))
         man_url = urlh.geturl()
         # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
         # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244
@@ -387,7 +388,10 @@ class F4mFD(FragmentFD):
             url_parsed = base_url_parsed._replace(path=base_url_parsed.path + name, query='&'.join(query))
             frag_filename = '%s-%s' % (ctx['tmpfilename'], name)
             try:
-                success = ctx['dl'].download(frag_filename, {'url': url_parsed.geturl()})
+                success = ctx['dl'].download(frag_filename, {
+                    'url': url_parsed.geturl(),
+                    'http_headers': info_dict.get('http_headers'),
+                })
                 if not success:
                     return False
                 (down, frag_sanitized) = sanitize_open(frag_filename, 'rb')
index 84aacf7db6b839d6bf52f6254b58f1323822290b..60df627a65dfc589899f009fa5df9ce76a441ae5 100644 (file)
@@ -9,6 +9,7 @@ from ..utils import (
     error_to_compat_str,
     encodeFilename,
     sanitize_open,
+    sanitized_Request,
 )
 
 
@@ -37,6 +38,10 @@ class FragmentFD(FileDownloader):
     def report_skip_fragment(self, fragment_name):
         self.to_screen('[download] Skipping fragment %s...' % fragment_name)
 
+    def _prepare_url(self, info_dict, url):
+        headers = info_dict.get('http_headers')
+        return sanitized_Request(url, None, headers) if headers else url
+
     def _prepare_and_start_frag_download(self, ctx):
         self._prepare_frag_download(ctx)
         self._start_frag_download(ctx)
index 541b92ee122261f8230ede54e57c07b68dc40cac..7373ec05fd0d4a1d983f48668229b21d98977581 100644 (file)
@@ -59,7 +59,8 @@ class HlsFD(FragmentFD):
     def real_download(self, filename, info_dict):
         man_url = info_dict['url']
         self.to_screen('[%s] Downloading m3u8 manifest' % self.FD_NAME)
-        manifest = self.ydl.urlopen(man_url).read()
+
+        manifest = self.ydl.urlopen(self._prepare_url(info_dict, man_url)).read()
 
         s = manifest.decode('utf-8', 'ignore')
 
@@ -112,7 +113,10 @@ class HlsFD(FragmentFD):
                     count = 0
                     while count <= fragment_retries:
                         try:
-                            success = ctx['dl'].download(frag_filename, {'url': frag_url})
+                            success = ctx['dl'].download(frag_filename, {
+                                'url': frag_url,
+                                'http_headers': info_dict.get('http_headers'),
+                            })
                             if not success:
                                 return False
                             down, frag_sanitized = sanitize_open(frag_filename, 'rb')
index 518c61f67eb0befa0ce59fb393d10d8ebd4dcc03..75b36699363609876c755d4c120ec195aa81ec3a 100644 (file)
@@ -11,6 +11,7 @@ from ..compat import (
 from ..utils import (
     ExtractorError,
     int_or_none,
+    update_url_query,
     xpath_element,
     xpath_text,
 )
@@ -18,12 +19,18 @@ from ..utils import (
 
 class AfreecaTVIE(InfoExtractor):
     IE_DESC = 'afreecatv.com'
-    _VALID_URL = r'''(?x)^
-        https?://(?:(live|afbbs|www)\.)?afreeca(?:tv)?\.com(?::\d+)?
-        (?:
-            /app/(?:index|read_ucc_bbs)\.cgi|
-            /player/[Pp]layer\.(?:swf|html))
-        \?.*?\bnTitleNo=(?P<id>\d+)'''
+    _VALID_URL = r'''(?x)
+                    https?://
+                        (?:
+                            (?:(?:live|afbbs|www)\.)?afreeca(?:tv)?\.com(?::\d+)?
+                            (?:
+                                /app/(?:index|read_ucc_bbs)\.cgi|
+                                /player/[Pp]layer\.(?:swf|html)
+                            )\?.*?\bnTitleNo=|
+                            vod\.afreecatv\.com/PLAYER/STATION/
+                        )
+                        (?P<id>\d+)
+                    '''
     _TESTS = [{
         'url': 'http://live.afreecatv.com:8079/app/index.cgi?szType=read_ucc_bbs&szBjId=dailyapril&nStationNo=16711924&nBbsNo=18605867&nTitleNo=36164052&szSkin=',
         'md5': 'f72c89fe7ecc14c1b5ce506c4996046e',
@@ -66,6 +73,9 @@ class AfreecaTVIE(InfoExtractor):
     }, {
         'url': 'http://www.afreecatv.com/player/Player.swf?szType=szBjId=djleegoon&nStationNo=11273158&nBbsNo=13161095&nTitleNo=36327652',
         'only_matching': True,
+    }, {
+        'url': 'http://vod.afreecatv.com/PLAYER/STATION/15055030',
+        'only_matching': True,
     }]
 
     @staticmethod
@@ -83,7 +93,9 @@ class AfreecaTVIE(InfoExtractor):
         info_url = compat_urlparse.urlunparse(parsed_url._replace(
             netloc='afbbs.afreecatv.com:8080',
             path='/api/video/get_video_info.php'))
-        video_xml = self._download_xml(info_url, video_id)
+
+        video_xml = self._download_xml(
+            update_url_query(info_url, {'nTitleNo': video_id}), video_id)
 
         if xpath_element(video_xml, './track/video/file') is None:
             raise ExtractorError('Specified AfreecaTV video does not exist',
index 8af318703b0ae9ad57fedb49c7f320288322caa9..e00bdaf66a6d9eb6ac051cc169cabbf02844770b 100755 (executable)
@@ -5,14 +5,16 @@ import re
 
 from .common import InfoExtractor
 from ..utils import (
-    decode_packed_codes,
     ExtractorError,
-    parse_duration
+    float_or_none,
+    int_or_none,
+    parse_duration,
 )
 
 
 class CDAIE(InfoExtractor):
     _VALID_URL = r'https?://(?:(?:www\.)?cda\.pl/video|ebd\.cda\.pl/[0-9]+x[0-9]+)/(?P<id>[0-9a-z]+)'
+    _BASE_URL = 'http://www.cda.pl/'
     _TESTS = [{
         'url': 'http://www.cda.pl/video/5749950c',
         'md5': '6f844bf51b15f31fae165365707ae970',
@@ -21,6 +23,9 @@ class CDAIE(InfoExtractor):
             'ext': 'mp4',
             'height': 720,
             'title': 'Oto dlaczego przed zakrętem należy zwolnić.',
+            'description': 'md5:269ccd135d550da90d1662651fcb9772',
+            'thumbnail': 're:^https?://.*\.jpg$',
+            'average_rating': float,
             'duration': 39
         }
     }, {
@@ -30,6 +35,11 @@ class CDAIE(InfoExtractor):
             'id': '57413289',
             'ext': 'mp4',
             'title': 'Lądowanie na lotnisku na Maderze',
+            'description': 'md5:60d76b71186dcce4e0ba6d4bbdb13e1a',
+            'thumbnail': 're:^https?://.*\.jpg$',
+            'uploader': 'crash404',
+            'view_count': int,
+            'average_rating': float,
             'duration': 137
         }
     }, {
@@ -39,31 +49,55 @@ class CDAIE(InfoExtractor):
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
-        webpage = self._download_webpage('http://ebd.cda.pl/0x0/' + video_id, video_id)
+        self._set_cookie('cda.pl', 'cda.player', 'html5')
+        webpage = self._download_webpage(
+            self._BASE_URL + '/video/' + video_id, video_id)
 
         if 'Ten film jest dostępny dla użytkowników premium' in webpage:
             raise ExtractorError('This video is only available for premium users.', expected=True)
 
-        title = self._html_search_regex(r'<title>(.+?)</title>', webpage, 'title')
-
         formats = []
 
+        uploader = self._search_regex(r'''(?x)
+            <(span|meta)[^>]+itemprop=(["\'])author\2[^>]*>
+            (?:<\1[^>]*>[^<]*</\1>|(?!</\1>)(?:.|\n))*?
+            <(span|meta)[^>]+itemprop=(["\'])name\4[^>]*>(?P<uploader>[^<]+)</\3>
+        ''', webpage, 'uploader', default=None, group='uploader')
+        view_count = self._search_regex(
+            r'Odsłony:(?:\s|&nbsp;)*([0-9]+)', webpage,
+            'view_count', default=None)
+        average_rating = self._search_regex(
+            r'<(?:span|meta)[^>]+itemprop=(["\'])ratingValue\1[^>]*>(?P<rating_value>[0-9.]+)',
+            webpage, 'rating', fatal=False, group='rating_value')
+
         info_dict = {
             'id': video_id,
-            'title': title,
+            'title': self._og_search_title(webpage),
+            'description': self._og_search_description(webpage),
+            'uploader': uploader,
+            'view_count': int_or_none(view_count),
+            'average_rating': float_or_none(average_rating),
+            'thumbnail': self._og_search_thumbnail(webpage),
             'formats': formats,
             'duration': None,
         }
 
         def extract_format(page, version):
-            unpacked = decode_packed_codes(page)
-            format_url = self._search_regex(
-                r"(?:file|url)\s*:\s*(\\?[\"'])(?P<url>http.+?)\1", unpacked,
-                '%s url' % version, fatal=False, group='url')
-            if not format_url:
+            json_str = self._search_regex(
+                r'player_data=(\\?["\'])(?P<player_data>.+?)\1', page,
+                '%s player_json' % version, fatal=False, group='player_data')
+            if not json_str:
+                return
+            player_data = self._parse_json(
+                json_str, '%s player_data' % version, fatal=False)
+            if not player_data:
+                return
+            video = player_data.get('video')
+            if not video or 'file' not in video:
+                self.report_warning('Unable to extract %s version information' % version)
                 return
             f = {
-                'url': format_url,
+                'url': video['file'],
             }
             m = re.search(
                 r'<a[^>]+data-quality="(?P<format_id>[^"]+)"[^>]+href="[^"]+"[^>]+class="[^"]*quality-btn-active[^"]*">(?P<height>[0-9]+)p',
@@ -75,9 +109,7 @@ class CDAIE(InfoExtractor):
                 })
             info_dict['formats'].append(f)
             if not info_dict['duration']:
-                info_dict['duration'] = parse_duration(self._search_regex(
-                    r"duration\s*:\s*(\\?[\"'])(?P<duration>.+?)\1",
-                    unpacked, 'duration', fatal=False, group='duration'))
+                info_dict['duration'] = parse_duration(video.get('duration'))
 
         extract_format(webpage, 'default')
 
@@ -85,7 +117,8 @@ class CDAIE(InfoExtractor):
                 r'<a[^>]+data-quality="[^"]+"[^>]+href="([^"]+)"[^>]+class="quality-btn"[^>]*>([0-9]+p)',
                 webpage):
             webpage = self._download_webpage(
-                href, video_id, 'Downloading %s version information' % resolution, fatal=False)
+                self._BASE_URL + href, video_id,
+                'Downloading %s version information' % resolution, fatal=False)
             if not webpage:
                 # Manually report warning because empty page is returned when
                 # invalid version is requested.
index 5f4c984a9db08612c6bc240bd543a77e739cfcfd..05c51fac9b0b4162fb126cb79a79d871b591ead8 100644 (file)
@@ -886,7 +886,7 @@ class InfoExtractor(object):
                         'url': e.get('contentUrl'),
                         'title': unescapeHTML(e.get('name')),
                         'description': unescapeHTML(e.get('description')),
-                        'thumbnail': e.get('thumbnailUrl'),
+                        'thumbnail': e.get('thumbnailUrl') or e.get('thumbnailURL'),
                         'duration': parse_duration(e.get('duration')),
                         'timestamp': unified_timestamp(e.get('uploadDate')),
                         'filesize': float_or_none(e.get('contentSize')),
@@ -1703,7 +1703,7 @@ class InfoExtractor(object):
                                 representation_ms_info['fragments'] = [{
                                     'url': media_template % {
                                         'Number': segment_number,
-                                        'Bandwidth': representation_attrib.get('bandwidth'),
+                                        'Bandwidth': int_or_none(representation_attrib.get('bandwidth')),
                                     },
                                     'duration': segment_duration,
                                 } for segment_number in range(
@@ -1721,7 +1721,7 @@ class InfoExtractor(object):
                                 def add_segment_url():
                                     segment_url = media_template % {
                                         'Time': segment_time,
-                                        'Bandwidth': representation_attrib.get('bandwidth'),
+                                        'Bandwidth': int_or_none(representation_attrib.get('bandwidth')),
                                         'Number': segment_number,
                                     }
                                     representation_ms_info['fragments'].append({
index 3700b7ab2eec4070fee53258bb0707d79aa37728..c89aac63ee90f133074d8ade8b7af23cf020f148 100644 (file)
@@ -1,6 +1,7 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
+import random
 import re
 
 from .common import InfoExtractor
@@ -14,6 +15,25 @@ from ..utils import (
 
 
 class NRKBaseIE(InfoExtractor):
+    _faked_ip = None
+
+    def _download_webpage_handle(self, *args, **kwargs):
+        # NRK checks X-Forwarded-For HTTP header in order to figure out the
+        # origin of the client behind proxy. This allows to bypass geo
+        # restriction by faking this header's value to some Norway IP.
+        # We will do so once we encounter any geo restriction error.
+        if self._faked_ip:
+            # NB: str is intentional
+            kwargs.setdefault(str('headers'), {})['X-Forwarded-For'] = self._faked_ip
+        return super(NRKBaseIE, self)._download_webpage_handle(*args, **kwargs)
+
+    def _fake_ip(self):
+        # Use fake IP from 37.191.128.0/17 in order to workaround geo
+        # restriction
+        def octet(lb=0, ub=255):
+            return random.randint(lb, ub)
+        self._faked_ip = '37.191.%d.%d' % (octet(128), octet())
+
     def _real_extract(self, url):
         video_id = self._match_id(url)
 
@@ -24,6 +44,8 @@ class NRKBaseIE(InfoExtractor):
         title = data.get('fullTitle') or data.get('mainTitle') or data['title']
         video_id = data.get('id') or video_id
 
+        http_headers = {'X-Forwarded-For': self._faked_ip} if self._faked_ip else {}
+
         entries = []
 
         media_assets = data.get('mediaAssets')
@@ -54,6 +76,7 @@ class NRKBaseIE(InfoExtractor):
                     'duration': duration,
                     'subtitles': subtitles,
                     'formats': formats,
+                    'http_headers': http_headers,
                 })
 
         if not entries:
@@ -70,10 +93,23 @@ class NRKBaseIE(InfoExtractor):
                 }]
 
         if not entries:
-            if data.get('usageRights', {}).get('isGeoBlocked'):
-                raise ExtractorError(
-                    'NRK har ikke rettigheter til å vise dette programmet utenfor Norge',
-                    expected=True)
+            message_type = data.get('messageType', '')
+            # Can be ProgramIsGeoBlocked or ChannelIsGeoBlocked*
+            if 'IsGeoBlocked' in message_type and not self._faked_ip:
+                self.report_warning(
+                    'Video is geo restricted, trying to fake IP')
+                self._fake_ip()
+                return self._real_extract(url)
+
+            MESSAGES = {
+                'ProgramRightsAreNotReady': 'Du kan dessverre ikke se eller høre programmet',
+                'ProgramRightsHasExpired': 'Programmet har gått ut',
+                'ProgramIsGeoBlocked': 'NRK har ikke rettigheter til å vise dette programmet utenfor Norge',
+            }
+            raise ExtractorError(
+                '%s said: %s' % (self.IE_NAME, MESSAGES.get(
+                    message_type, message_type)),
+                expected=True)
 
         conviva = data.get('convivaStatistics') or {}
         series = conviva.get('seriesName') or data.get('seriesTitle')
index c3c38cf4ac07787e520c7c2c7eac7da1ed2aa8b4..ddfc6f1486c4b49185bf68b3be3ff9ba9e957633 100644 (file)
@@ -8,30 +8,31 @@ from ..utils import int_or_none
 
 
 class PlaysTVIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?plays\.tv/video/(?P<id>[0-9a-f]{18})'
-    _TEST = {
-        'url': 'http://plays.tv/video/56af17f56c95335490/when-you-outplay-the-azir-wall',
+    _VALID_URL = r'https?://(?:www\.)?plays\.tv/(?:video|embeds)/(?P<id>[0-9a-f]{18})'
+    _TESTS = [{
+        'url': 'https://plays.tv/video/56af17f56c95335490/when-you-outplay-the-azir-wall',
         'md5': 'dfeac1198506652b5257a62762cec7bc',
         'info_dict': {
             'id': '56af17f56c95335490',
             'ext': 'mp4',
-            'title': 'When you outplay the Azir wall',
+            'title': 'Bjergsen - When you outplay the Azir wall',
             'description': 'Posted by Bjergsen',
         }
-    }
+    }, {
+        'url': 'https://plays.tv/embeds/56af17f56c95335490',
+        'only_matching': True,
+    }]
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
-        webpage = self._download_webpage(url, video_id)
+        webpage = self._download_webpage(
+            'https://plays.tv/video/%s' % video_id, video_id)
+
+        info = self._search_json_ld(webpage, video_id,)
 
-        title = self._og_search_title(webpage)
-        content = self._parse_json(
-            self._search_regex(
-                r'R\.bindContent\(({.+?})\);', webpage,
-                'content'), video_id)['content']
         mpd_url, sources = re.search(
             r'(?s)<video[^>]+data-mpd="([^"]+)"[^>]*>(.+?)</video>',
-            content).groups()
+            webpage).groups()
         formats = self._extract_mpd_formats(
             self._proto_relative_url(mpd_url), video_id, mpd_id='DASH')
         for format_id, height, format_url in re.findall(r'<source\s+res="((\d+)h?)"\s+src="([^"]+)"', sources):
@@ -42,10 +43,11 @@ class PlaysTVIE(InfoExtractor):
             })
         self._sort_formats(formats)
 
-        return {
+        info.update({
             'id': video_id,
-            'title': title,
             'description': self._og_search_description(webpage),
-            'thumbnail': self._og_search_thumbnail(webpage),
+            'thumbnail': info.get('thumbnail') or self._og_search_thumbnail(webpage),
             'formats': formats,
-        }
+        })
+
+        return info
index 8d671cca767d4592a5428f7d3ad855e952df5353..acf9fda487f6143906b8162a158c1cf9f53fec68 100644 (file)
@@ -17,7 +17,7 @@ from ..compat import compat_urllib_parse_urlencode
 class VLiveIE(InfoExtractor):
     IE_NAME = 'vlive'
     _VALID_URL = r'https?://(?:(?:www|m)\.)?vlive\.tv/video/(?P<id>[0-9]+)'
-    _TEST = {
+    _TESTS = [{
         'url': 'http://www.vlive.tv/video/1326',
         'md5': 'cc7314812855ce56de70a06a27314983',
         'info_dict': {
@@ -27,7 +27,20 @@ class VLiveIE(InfoExtractor):
             'creator': "Girl's Day",
             'view_count': int,
         },
-    }
+    }, {
+        'url': 'http://www.vlive.tv/video/16937',
+        'info_dict': {
+            'id': '16937',
+            'ext': 'mp4',
+            'title': '[V LIVE] 첸백시 걍방',
+            'creator': 'EXO',
+            'view_count': int,
+            'subtitles': 'mincount:12',
+        },
+        'params': {
+            'skip_download': True,
+        },
+    }]
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
@@ -116,7 +129,7 @@ class VLiveIE(InfoExtractor):
 
         subtitles = {}
         for caption in playinfo.get('captions', {}).get('list', []):
-            lang = dict_get(caption, ('language', 'locale', 'country', 'label'))
+            lang = dict_get(caption, ('locale', 'language', 'country', 'label'))
             if lang and caption.get('source'):
                 subtitles[lang] = [{
                     'ext': 'vtt',
index 69df88c6e83690467baccefa5f32126929b7eb1d..9557b2000d19990dea023af0f5b11945fc4eb2c4 100644 (file)
@@ -1,3 +1,3 @@
 from __future__ import unicode_literals
 
-__version__ = '2016.11.08.1'
+__version__ = '2016.11.14.1'