Merge pull request #12752 from triple-j/go90_improvements_pull_request
authorremitamine <remitamine@gmail.com>
Sat, 15 Apr 2017 21:48:13 +0000 (23:48 +0200)
committerGitHub <noreply@github.com>
Sat, 15 Apr 2017 21:48:13 +0000 (23:48 +0200)
[go90] Improve extraction

.github/ISSUE_TEMPLATE.md
ChangeLog
docs/supportedsites.md
test/test_YoutubeDL.py
youtube_dl/YoutubeDL.py
youtube_dl/extractor/brightcove.py
youtube_dl/extractor/extractors.py
youtube_dl/extractor/generic.py
youtube_dl/extractor/streamango.py [new file with mode: 0644]
youtube_dl/extractor/wsj.py
youtube_dl/version.py

index 70f6b51ed2fe4c34ad8f7332ff0738e8366e702a..5d5adb199b029fb214d5310ccdd054da24926584 100644 (file)
@@ -6,8 +6,8 @@
 
 ---
 
-### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2017.04.15*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected.
-- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2017.04.15**
+### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2017.04.16*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected.
+- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2017.04.16**
 
 ### Before submitting an *issue* make sure you have:
 - [ ] At least skimmed through [README](https://github.com/rg3/youtube-dl/blob/master/README.md) and **most notably** [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections
@@ -35,7 +35,7 @@ $ youtube-dl -v <your command line>
 [debug] User config: []
 [debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj']
 [debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251
-[debug] youtube-dl version 2017.04.15
+[debug] youtube-dl version 2017.04.16
 [debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2
 [debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4
 [debug] Proxy map: {}
index cf5ee84a405f9a8a2b617855cf15bedd468bfc69..6be86a090a078ba6034e2cb2d859daf1f118c28e 100644 (file)
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,19 @@
+version 2017.04.16
+
+Core
+* [YoutubeDL] Apply expand_path after output template substitution
++ [YoutubeDL] Propagate overridden meta fields to extraction results of type
+  url (#11163)
+
+Extractors
++ [generic] Extract RSS entries as url_transparent (#11163)
++ [streamango] Add support for streamango.com (#12643)
++ [wsj:article] Add support for articles (#12558)
+* [brightcove] Relax video tag embeds extraction and validate ambiguous embeds'
+  URLs (#9163, #12005, #12178, #12480)
++ [udemy] Add support for react rendition (#12744)
+
+
 version 2017.04.15
 
 Extractors
index b29b50c8dcdabc12ca37e3edeb7c5d670b8021dc..afae8221430b953c7668ab6265b7e0ea027c8e0b 100644 (file)
  - **Steam**
  - **Stitcher**
  - **Streamable**
+ - **Streamango**
  - **streamcloud.eu**
  - **StreamCZ**
  - **StreetVoice**
  - **wrzuta.pl**
  - **wrzuta.pl:playlist**
  - **WSJ**: Wall Street Journal
+ - **WSJArticle**
  - **XBef**
  - **XboxClips**
  - **XFileShare**: XFileShare based sites: DaClips, FileHoot, GorillaVid, MovPod, PowerWatch, Rapidvideo.ws, TheVideoBee, Vidto, Streamin.To, XVIDSTAGE, Vid ABC, VidBom, vidlo
index 8491a88bd2f4730cd1af4e39d936966d6e1d3bb1..75945e38f837fc11496856180336fc2828121cca 100644 (file)
@@ -755,6 +755,7 @@ class TestYoutubeDL(unittest.TestCase):
                     '_type': 'url_transparent',
                     'url': 'foo2:',
                     'ie_key': 'Foo2',
+                    'title': 'foo1 title'
                 }
 
         class Foo2IE(InfoExtractor):
@@ -771,7 +772,7 @@ class TestYoutubeDL(unittest.TestCase):
             _VALID_URL = r'foo3:'
 
             def _real_extract(self, url):
-                return _make_result([{'url': TEST_URL}])
+                return _make_result([{'url': TEST_URL}], title='foo3 title')
 
         ydl.add_info_extractor(Foo1IE(ydl))
         ydl.add_info_extractor(Foo2IE(ydl))
@@ -779,6 +780,7 @@ class TestYoutubeDL(unittest.TestCase):
         ydl.extract_info('foo1:')
         downloaded = ydl.downloaded_info_dicts[0]
         self.assertEqual(downloaded['url'], TEST_URL)
+        self.assertEqual(downloaded['title'], 'foo1 title')
 
 
 if __name__ == '__main__':
index 7953670a732eb29abdd249ecadf63be5fc52e029..819b374efe67831d7a6bb829800f47ac200bf1a9 100755 (executable)
@@ -672,8 +672,7 @@ class YoutubeDL(object):
                         FORMAT_RE.format(numeric_field),
                         r'%({0})s'.format(numeric_field), outtmpl)
 
-            tmpl = expand_path(outtmpl)
-            filename = tmpl % template_dict
+            filename = expand_path(outtmpl % template_dict)
             # Temporary fix for #4787
             # 'Treat' all problem characters by passing filename through preferredencoding
             # to workaround encoding issues with subprocess on python2 @ Windows
@@ -851,7 +850,14 @@ class YoutubeDL(object):
             new_result = info.copy()
             new_result.update(force_properties)
 
-            assert new_result.get('_type') != 'url_transparent'
+            # Extracted info may not be a video result (i.e.
+            # info.get('_type', 'video') != video) but rather an url or
+            # url_transparent. In such cases outer metadata (from ie_result)
+            # should be propagated to inner one (info). For this to happen
+            # _type of info should be overridden with url_transparent. This
+            # fixes issue from https://github.com/rg3/youtube-dl/pull/11163.
+            if new_result.get('_type') == 'url':
+                new_result['_type'] = 'url_transparent'
 
             return self.process_ie_result(
                 new_result, download=download, extra_info=extra_info)
index 46ef8e605286ac0b079dfe30720de6a1ba9f48f5..124497e95c7236b21d17419133cdb63cee55f955 100644 (file)
@@ -17,6 +17,7 @@ from ..compat import (
 from ..utils import (
     determine_ext,
     ExtractorError,
+    extract_attributes,
     find_xpath_attr,
     fix_xml_ampersands,
     float_or_none,
@@ -109,6 +110,7 @@ class BrightcoveLegacyIE(InfoExtractor):
                 'upload_date': '20140827',
                 'uploader_id': '710858724001',
             },
+            'skip': 'Video gone',
         },
         {
             # playlist with 'videoList'
@@ -487,12 +489,13 @@ class BrightcoveNewIE(InfoExtractor):
         return urls[0] if urls else None
 
     @staticmethod
-    def _extract_urls(webpage):
+    def _extract_urls(ie, webpage):
         # Reference:
         # 1. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/publish-video.html#setvideoiniframe
-        # 2. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/publish-video.html#setvideousingjavascript
-        # 3. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/embed-in-page.html
-        # 4. https://support.brightcove.com/en/video-cloud/docs/dynamically-assigning-videos-player
+        # 2. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/publish-video.html#tag
+        # 3. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/publish-video.html#setvideousingjavascript
+        # 4. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/in-page-embed-player-implementation.html
+        # 5. https://support.brightcove.com/en/video-cloud/docs/dynamically-assigning-videos-player
 
         entries = []
 
@@ -501,22 +504,48 @@ class BrightcoveNewIE(InfoExtractor):
                 r'<iframe[^>]+src=(["\'])((?:https?:)?//players\.brightcove\.net/\d+/[^/]+/index\.html.+?)\1', webpage):
             entries.append(url if url.startswith('http') else 'http:' + url)
 
-        # Look for embed_in_page embeds [2]
-        for video_id, account_id, player_id, embed in re.findall(
-                # According to examples from [3] it's unclear whether video id
-                # may be optional and what to do when it is
-                # According to [4] data-video-id may be prefixed with ref:
-                r'''(?sx)
-                    <video[^>]+
-                        data-video-id=["\'](\d+|ref:[^"\']+)["\'][^>]*>.*?
-                    </video>.*?
-                    <script[^>]+
-                        src=["\'](?:https?:)?//players\.brightcove\.net/
-                        (\d+)/([^/]+)_([^/]+)/index(?:\.min)?\.js
+        # Look for <video> tags [2] and embed_in_page embeds [3]
+        # [2] looks like:
+        for video, script_tag, account_id, player_id, embed in re.findall(
+                r'''(?isx)
+                    (<video\s+[^>]+>)
+                    (?:.*?
+                        (<script[^>]+
+                            src=["\'](?:https?:)?//players\.brightcove\.net/
+                            (\d+)/([^/]+)_([^/]+)/index(?:\.min)?\.js
+                        )
+                    )?
                 ''', webpage):
-            entries.append(
-                'http://players.brightcove.net/%s/%s_%s/index.html?videoId=%s'
-                % (account_id, player_id, embed, video_id))
+            attrs = extract_attributes(video)
+
+            # According to examples from [4] it's unclear whether video id
+            # may be optional and what to do when it is
+            video_id = attrs.get('data-video-id')
+            if not video_id:
+                continue
+
+            account_id = account_id or attrs.get('data-account')
+            if not account_id:
+                continue
+
+            player_id = player_id or attrs.get('data-player') or 'default'
+            embed = embed or attrs.get('data-embed') or 'default'
+
+            bc_url = 'http://players.brightcove.net/%s/%s_%s/index.html?videoId=%s' % (
+                account_id, player_id, embed, video_id)
+
+            # Some brightcove videos may be embedded with video tag only and
+            # without script tag or any mentioning of brightcove at all. Such
+            # embeds are considered ambiguous since they are matched based only
+            # on data-video-id and data-account attributes and in the wild may
+            # not be brightcove embeds at all. Let's check reconstructed
+            # brightcove URLs in case of such embeds and only process valid
+            # ones. By this we ensure there is indeed a brightcove embed.
+            if not script_tag and not ie._is_valid_url(
+                    bc_url, video_id, 'possible brightcove video'):
+                continue
+
+            entries.append(bc_url)
 
         return entries
 
index 1671090f47caab4f8a02b05166645942aabafd1d..a92cbefed2f342086dbf5b37409e4bb2e6e7fd62 100644 (file)
@@ -939,6 +939,7 @@ from .srmediathek import SRMediathekIE
 from .stanfordoc import StanfordOpenClassroomIE
 from .steam import SteamIE
 from .streamable import StreamableIE
+from .streamango import StreamangoIE
 from .streamcloud import StreamcloudIE
 from .streamcz import StreamCZIE
 from .streetvoice import StreetVoiceIE
@@ -1233,7 +1234,10 @@ from .wrzuta import (
     WrzutaIE,
     WrzutaPlaylistIE,
 )
-from .wsj import WSJIE
+from .wsj import (
+    WSJIE,
+    WSJArticleIE,
+)
 from .xbef import XBefIE
 from .xboxclips import XboxClipsIE
 from .xfileshare import XFileShareIE
index 36d23d2f37181a37e3ba79db9c9e65ec42d7d960..6a34c2491ebcbef18b3c7017a0567c433acaa842 100644 (file)
@@ -449,6 +449,59 @@ class GenericIE(InfoExtractor):
                 },
             }],
         },
+        {
+            # Brightcove with UUID in videoPlayer
+            'url': 'http://www8.hp.com/cn/zh/home.html',
+            'info_dict': {
+                'id': '5255815316001',
+                'ext': 'mp4',
+                'title': 'Sprocket Video - China',
+                'description': 'Sprocket Video - China',
+                'uploader': 'HP-Video Gallery',
+                'timestamp': 1482263210,
+                'upload_date': '20161220',
+                'uploader_id': '1107601872001',
+            },
+            'params': {
+                'skip_download': True,  # m3u8 download
+            },
+            'skip': 'video rotates...weekly?',
+        },
+        {
+            # Brightcove:new type [2].
+            'url': 'http://www.delawaresportszone.com/video-st-thomas-more-earns-first-trip-to-basketball-semis',
+            'md5': '2b35148fcf48da41c9fb4591650784f3',
+            'info_dict': {
+                'id': '5348741021001',
+                'ext': 'mp4',
+                'upload_date': '20170306',
+                'uploader_id': '4191638492001',
+                'timestamp': 1488769918,
+                'title': 'VIDEO:  St. Thomas More earns first trip to basketball semis',
+
+            },
+        },
+        {
+            # Alternative brightcove <video> attributes
+            'url': 'http://www.programme-tv.net/videos/extraits/81095-guillaume-canet-evoque-les-rumeurs-d-infidelite-de-marion-cotillard-avec-brad-pitt-dans-vivement-dimanche/',
+            'info_dict': {
+                'id': '81095-guillaume-canet-evoque-les-rumeurs-d-infidelite-de-marion-cotillard-avec-brad-pitt-dans-vivement-dimanche',
+                'title': "Guillaume Canet évoque les rumeurs d'infidélité de Marion Cotillard avec Brad Pitt dans Vivement Dimanche, Extraits : toutes les vidéos avec Télé-Loisirs",
+            },
+            'playlist': [{
+                'md5': '732d22ba3d33f2f3fc253c39f8f36523',
+                'info_dict': {
+                    'id': '5311302538001',
+                    'ext': 'mp4',
+                    'title': "Guillaume Canet évoque les rumeurs d'infidélité de Marion Cotillard avec Brad Pitt dans Vivement Dimanche",
+                    'description': "Guillaume Canet évoque les rumeurs d'infidélité de Marion Cotillard avec Brad Pitt dans Vivement Dimanche (France 2, 5 février 2017)",
+                    'timestamp': 1486321708,
+                    'upload_date': '20170205',
+                    'uploader_id': '800000640001',
+                },
+                'only_matching': True,
+            }],
+        },
         {
             # Brightcove with UUID in videoPlayer
             'url': 'http://www8.hp.com/cn/zh/home.html',
@@ -1640,7 +1693,7 @@ class GenericIE(InfoExtractor):
                 continue
 
             entries.append({
-                '_type': 'url',
+                '_type': 'url_transparent',
                 'url': next_url,
                 'title': it.find('title').text,
             })
@@ -1900,7 +1953,6 @@ class GenericIE(InfoExtractor):
         # Look for Brightcove Legacy Studio embeds
         bc_urls = BrightcoveLegacyIE._extract_brightcove_urls(webpage)
         if bc_urls:
-            self.to_screen('Brightcove video detected.')
             entries = [{
                 '_type': 'url',
                 'url': smuggle_url(bc_url, {'Referer': url}),
@@ -1915,7 +1967,7 @@ class GenericIE(InfoExtractor):
             }
 
         # Look for Brightcove New Studio embeds
-        bc_urls = BrightcoveNewIE._extract_urls(webpage)
+        bc_urls = BrightcoveNewIE._extract_urls(self, webpage)
         if bc_urls:
             return self.playlist_from_matches(bc_urls, video_id, video_title, ie='BrightcoveNew')
 
diff --git a/youtube_dl/extractor/streamango.py b/youtube_dl/extractor/streamango.py
new file mode 100644 (file)
index 0000000..aa4fad1
--- /dev/null
@@ -0,0 +1,64 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+    determine_ext,
+    int_or_none,
+    js_to_json,
+)
+
+
+class StreamangoIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?streamango\.com/(?:f|embed)/(?P<id>[^/?#&]+)'
+    _TESTS = [{
+        'url': 'https://streamango.com/f/clapasobsptpkdfe/20170315_150006_mp4',
+        'md5': 'e992787515a182f55e38fc97588d802a',
+        'info_dict': {
+            'id': 'clapasobsptpkdfe',
+            'ext': 'mp4',
+            'title': '20170315_150006.mp4',
+        }
+    }, {
+        'url': 'https://streamango.com/embed/clapasobsptpkdfe/20170315_150006_mp4',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
+        title = self._og_search_title(webpage)
+
+        formats = []
+        for format_ in re.findall(r'({[^}]*\bsrc\s*:\s*[^}]*})', webpage):
+            video = self._parse_json(
+                format_, video_id, transform_source=js_to_json, fatal=False)
+            if not video:
+                continue
+            src = video.get('src')
+            if not src:
+                continue
+            ext = determine_ext(src, default_ext=None)
+            if video.get('type') == 'application/dash+xml' or ext == 'mpd':
+                formats.extend(self._extract_mpd_formats(
+                    src, video_id, mpd_id='dash', fatal=False))
+            else:
+                formats.append({
+                    'url': src,
+                    'ext': ext or 'mp4',
+                    'width': int_or_none(video.get('width')),
+                    'height': int_or_none(video.get('height')),
+                    'tbr': int_or_none(video.get('bitrate')),
+                })
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'url': url,
+            'title': title,
+            'formats': formats,
+        }
index deb7483ae51699df4670675db4622503320f1cbc..45cfca7c52e92fa8f4aa8e06e633d5198e3066d3 100644 (file)
@@ -10,12 +10,14 @@ from ..utils import (
 
 
 class WSJIE(InfoExtractor):
-    _VALID_URL = r'''(?x)https?://
-        (?:
-            video-api\.wsj\.com/api-video/player/iframe\.html\?guid=|
-            (?:www\.)?wsj\.com/video/[^/]+/
-        )
-        (?P<id>[a-zA-Z0-9-]+)'''
+    _VALID_URL = r'''(?x)
+                        (?:
+                            https?://video-api\.wsj\.com/api-video/player/iframe\.html\?.*?\bguid=|
+                            https?://(?:www\.)?wsj\.com/video/[^/]+/|
+                            wsj:
+                        )
+                        (?P<id>[a-fA-F0-9-]{36})
+                    '''
     IE_DESC = 'Wall Street Journal'
     _TESTS = [{
         'url': 'http://video-api.wsj.com/api-video/player/iframe.html?guid=1BD01A4C-BFE8-40A5-A42F-8A8AF9898B1A',
@@ -38,12 +40,17 @@ class WSJIE(InfoExtractor):
     def _real_extract(self, url):
         video_id = self._match_id(url)
 
-        api_url = (
-            'http://video-api.wsj.com/api-video/find_all_videos.asp?'
-            'type=guid&count=1&query=%s&fields=type,hls,videoMP4List,'
-            'thumbnailList,author,description,name,duration,videoURL,'
-            'titletag,formattedCreationDate,keywords,editor' % video_id)
-        info = self._download_json(api_url, video_id)['items'][0]
+        info = self._download_json(
+            'http://video-api.wsj.com/api-video/find_all_videos.asp', video_id,
+            query={
+                'type': 'guid',
+                'count': 1,
+                'query': video_id,
+                'fields': ','.join((
+                    'type', 'hls', 'videoMP4List', 'thumbnailList', 'author',
+                    'description', 'name', 'duration', 'videoURL', 'titletag',
+                    'formattedCreationDate', 'keywords', 'editor')),
+            })['items'][0]
         title = info.get('name', info.get('titletag'))
 
         formats = []
@@ -87,3 +94,24 @@ class WSJIE(InfoExtractor):
             'title': title,
             'categories': info.get('keywords'),
         }
+
+
+class WSJArticleIE(InfoExtractor):
+    _VALID_URL = r'(?i)https?://(?:www\.)?wsj\.com/articles/(?P<id>[^/?#&]+)'
+    _TEST = {
+        'url': 'https://www.wsj.com/articles/dont-like-china-no-pandas-for-you-1490366939?',
+        'info_dict': {
+            'id': '4B13FA62-1D8C-45DB-8EA1-4105CB20B362',
+            'ext': 'mp4',
+            'upload_date': '20170221',
+            'uploader_id': 'ralcaraz',
+            'title': 'Bao Bao the Panda Leaves for China',
+        }
+    }
+
+    def _real_extract(self, url):
+        article_id = self._match_id(url)
+        webpage = self._download_webpage(url, article_id)
+        video_id = self._search_regex(
+            r'data-src=["\']([a-fA-F0-9-]{36})', webpage, 'video id')
+        return self.url_result('wsj:%s' % video_id, WSJIE.ie_key(), video_id)
index 612b50f7b2a33bf6e4cadcc8e8eb6753bd726d45..8b01fbc0af4fbf70d2093725f309d72194f3d461 100644 (file)
@@ -1,3 +1,3 @@
 from __future__ import unicode_literals
 
-__version__ = '2017.04.15'
+__version__ = '2017.04.16'