Merge pull request #8228 from jaimeMF/disable-file-handler
authorSergey M <dstftw@gmail.com>
Thu, 14 Jan 2016 17:20:02 +0000 (22:20 +0500)
committerSergey M <dstftw@gmail.com>
Thu, 14 Jan 2016 17:20:02 +0000 (22:20 +0500)
[YoutubeDL] urlopen: disable the 'file:' protocol (#8227)

AUTHORS
docs/supportedsites.md
youtube_dl/extractor/__init__.py
youtube_dl/extractor/beeg.py
youtube_dl/extractor/canvas.py [new file with mode: 0644]
youtube_dl/extractor/ntvde.py
youtube_dl/extractor/orf.py
youtube_dl/extractor/prosiebensat1.py
youtube_dl/extractor/unistra.py
youtube_dl/extractor/vodlocker.py
youtube_dl/version.py

diff --git a/AUTHORS b/AUTHORS
index 3d8bebbb06098836809857316233a9007a69840f..f4238e1d34112356ef212116b9c3a1854316b3f2 100644 (file)
--- a/AUTHORS
+++ b/AUTHORS
@@ -152,3 +152,4 @@ Evan Lu
 flatgreen
 Brian Foley
 Vignesh Venkat
+Tom Gijselinck
index 8d0c7b97a3c2dff65f9838c8a2f0c3f36837eee8..eb160bd2fafac9d80b297fd0089ef4d966f44885 100644 (file)
@@ -65,6 +65,7 @@
  - **Beeg**
  - **BehindKink**
  - **Bet**
+ - **Bigflix**
  - **Bild**: Bild.de
  - **BiliBili**
  - **BleacherReport**
  - **Instagram**
  - **instagram:user**: Instagram user profile
  - **InternetVideoArchive**
- - **IPrima**
+ - **IPrima** (Currently broken)
  - **iqiyi**: 爱奇艺
  - **Ir90Tv**
  - **ivi**: ivi.ru
  - **TruTube**
  - **Tube8**
  - **TubiTv**
- - **Tudou**
+ - **tudou**
+ - **tudou:album**
+ - **tudou:playlist**
  - **Tumblr**
  - **tunein:clip**
  - **tunein:program**
  - **video.mit.edu**
  - **VideoDetective**
  - **videofy.me**
- - **VideoMega**
+ - **VideoMega** (Currently broken)
  - **videomore**
  - **videomore:season**
  - **videomore:video**
  - **VideoPremium**
- - **VideoTt**: video.tt - Your True Tube
+ - **VideoTt**: video.tt - Your True Tube (Currently broken)
  - **videoweed**: VideoWeed
  - **Vidme**
  - **Vidzi**
index e4ae9332d28ccf180cca39bf852547489dc03db4..5621c9eb0ffd5556077cf626be261df237fb7e4a 100644 (file)
@@ -86,6 +86,7 @@ from .camdemy import (
 )
 from .canalplus import CanalplusIE
 from .canalc2 import Canalc2IE
+from .canvas import CanvasIE
 from .cbs import CBSIE
 from .cbsnews import CBSNewsIE
 from .cbssports import CBSSportsIE
index c8d921daf1c7c25d548945fbb104cb5b7181a538..34c2a756fba11f81516e87e095ef1e02d5e65417 100644 (file)
@@ -34,7 +34,7 @@ class BeegIE(InfoExtractor):
         video_id = self._match_id(url)
 
         video = self._download_json(
-            'http://beeg.com/api/v5/video/%s' % video_id, video_id)
+            'https://api.beeg.com/api/v5/video/%s' % video_id, video_id)
 
         def split(o, e):
             def cut(s, x):
@@ -60,7 +60,7 @@ class BeegIE(InfoExtractor):
 
         def decrypt_url(encrypted_url):
             encrypted_url = self._proto_relative_url(
-                encrypted_url.replace('{DATA_MARKERS}', ''), 'http:')
+                encrypted_url.replace('{DATA_MARKERS}', ''), 'https:')
             key = self._search_regex(
                 r'/key=(.*?)%2Cend=', encrypted_url, 'key', default=None)
             if not key:
diff --git a/youtube_dl/extractor/canvas.py b/youtube_dl/extractor/canvas.py
new file mode 100644 (file)
index 0000000..ee19ff8
--- /dev/null
@@ -0,0 +1,65 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import float_or_none
+
+
+class CanvasIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?canvas\.be/video/(?:[^/]+/)*(?P<id>[^/?#&]+)'
+    _TEST = {
+        'url': 'http://www.canvas.be/video/de-afspraak/najaar-2015/de-afspraak-veilt-voor-de-warmste-week',
+        'md5': 'ea838375a547ac787d4064d8c7860a6c',
+        'info_dict': {
+            'id': 'mz-ast-5e5f90b6-2d72-4c40-82c2-e134f884e93e',
+            'display_id': 'de-afspraak-veilt-voor-de-warmste-week',
+            'ext': 'mp4',
+            'title': 'De afspraak veilt voor de Warmste Week',
+            'description': 'md5:24cb860c320dc2be7358e0e5aa317ba6',
+            'thumbnail': 're:^https?://.*\.jpg$',
+            'duration': 49.02,
+        }
+    }
+
+    def _real_extract(self, url):
+        display_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, display_id)
+
+        title = self._search_regex(
+            r'<h1[^>]+class="video__body__header__title"[^>]*>(.+?)</h1>',
+            webpage, 'title', default=None) or self._og_search_title(webpage)
+
+        video_id = self._html_search_regex(
+            r'data-video=(["\'])(?P<id>.+?)\1', webpage, 'video id', group='id')
+
+        data = self._download_json(
+            'https://mediazone.vrt.be/api/v1/canvas/assets/%s' % video_id, display_id)
+
+        formats = []
+        for target in data['targetUrls']:
+            format_url, format_type = target.get('url'), target.get('type')
+            if not format_url or not format_type:
+                continue
+            if format_type == 'HLS':
+                formats.extend(self._extract_m3u8_formats(
+                    format_url, display_id, entry_protocol='m3u8_native',
+                    ext='mp4', preference=0, fatal=False, m3u8_id=format_type))
+            elif format_type == 'HDS':
+                formats.extend(self._extract_f4m_formats(
+                    format_url, display_id, f4m_id=format_type, fatal=False))
+            else:
+                formats.append({
+                    'format_id': format_type,
+                    'url': format_url,
+                })
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'display_id': display_id,
+            'title': title,
+            'description': self._og_search_description(webpage),
+            'formats': formats,
+            'duration': float_or_none(data.get('duration'), 1000),
+            'thumbnail': data.get('posterImageUrl'),
+        }
index d2cfe096192f6a44fd0f9dad2ece1474a0d845ab..a83e85cb8109ef44468851355f2b522e22fc5831 100644 (file)
@@ -2,6 +2,7 @@
 from __future__ import unicode_literals
 
 from .common import InfoExtractor
+from ..compat import compat_urlparse
 from ..utils import (
     int_or_none,
     js_to_json,
@@ -34,7 +35,7 @@ class NTVDeIE(InfoExtractor):
         webpage = self._download_webpage(url, video_id)
 
         info = self._parse_json(self._search_regex(
-            r'(?s)ntv.pageInfo.article =\s(\{.*?\});', webpage, 'info'),
+            r'(?s)ntv\.pageInfo\.article\s*=\s*(\{.*?\});', webpage, 'info'),
             video_id, transform_source=js_to_json)
         timestamp = int_or_none(info.get('publishedDateAsUnixTimeStamp'))
         vdata = self._parse_json(self._search_regex(
@@ -42,18 +43,24 @@ class NTVDeIE(InfoExtractor):
             webpage, 'player data'),
             video_id, transform_source=js_to_json)
         duration = parse_duration(vdata.get('duration'))
-        formats = [{
-            'format_id': 'flash',
-            'url': 'rtmp://fms.n-tv.de/' + vdata['video'],
-        }, {
-            'format_id': 'mobile',
-            'url': 'http://video.n-tv.de' + vdata['videoMp4'],
-            'tbr': 400,  # estimation
-        }]
-        m3u8_url = 'http://video.n-tv.de' + vdata['videoM3u8']
-        formats.extend(self._extract_m3u8_formats(
-            m3u8_url, video_id, ext='mp4',
-            entry_protocol='m3u8_native', preference=0))
+
+        formats = []
+        if vdata.get('video'):
+            formats.append({
+                'format_id': 'flash',
+                'url': 'rtmp://fms.n-tv.de/%s' % vdata['video'],
+            })
+        if vdata.get('videoMp4'):
+            formats.append({
+                'format_id': 'mobile',
+                'url': compat_urlparse.urljoin('http://video.n-tv.de', vdata['videoMp4']),
+                'tbr': 400,  # estimation
+            })
+        if vdata.get('videoM3u8'):
+            m3u8_url = compat_urlparse.urljoin('http://video.n-tv.de', vdata['videoM3u8'])
+            formats.extend(self._extract_m3u8_formats(
+                m3u8_url, video_id, ext='mp4', entry_protocol='m3u8_native',
+                preference=0, m3u8_id='hls', fatal=False))
         self._sort_formats(formats)
 
         return {
index 2e6c9872b5d251be4eb3c61addab113aad4d2416..c54775d54ab628203beb41c0dd52705c35821a55 100644 (file)
@@ -170,7 +170,21 @@ class ORFOE1IE(InfoExtractor):
 class ORFFM4IE(InfoExtractor):
     IE_NAME = 'orf:fm4'
     IE_DESC = 'radio FM4'
-    _VALID_URL = r'http://fm4\.orf\.at/7tage/?#(?P<date>[0-9]+)/(?P<show>\w+)'
+    _VALID_URL = r'http://fm4\.orf\.at/(?:7tage/?#|player/)(?P<date>[0-9]+)/(?P<show>\w+)'
+
+    _TEST = {
+        'url': 'http://fm4.orf.at/player/20160110/IS/',
+        'md5': '01e736e8f1cef7e13246e880a59ad298',
+        'info_dict': {
+            'id': '2016-01-10_2100_tl_54_7DaysSun13_11244',
+            'ext': 'mp3',
+            'title': 'Im Sumpf',
+            'description': 'md5:384c543f866c4e422a55f66a62d669cd',
+            'duration': 7173,
+            'timestamp': 1452456073,
+            'upload_date': '20160110',
+        },
+    }
 
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
index baa54a3afd10244ef2cda281c0785b24fe19e9cb..670e6950f3fcc67e78ecd466475e7317c8dfd826 100644 (file)
@@ -20,7 +20,7 @@ from ..utils import (
 class ProSiebenSat1IE(InfoExtractor):
     IE_NAME = 'prosiebensat1'
     IE_DESC = 'ProSiebenSat.1 Digital'
-    _VALID_URL = r'https?://(?:www\.)?(?:(?:prosieben|prosiebenmaxx|sixx|sat1|kabeleins|the-voice-of-germany)\.(?:de|at|ch)|ran\.de|fem\.com)/(?P<id>.+)'
+    _VALID_URL = r'https?://(?:www\.)?(?:(?:prosieben|prosiebenmaxx|sixx|sat1|kabeleins|the-voice-of-germany|7tv)\.(?:de|at|ch)|ran\.de|fem\.com)/(?P<id>.+)'
 
     _TESTS = [
         {
@@ -32,7 +32,7 @@ class ProSiebenSat1IE(InfoExtractor):
             'url': 'http://www.prosieben.de/tv/circus-halligalli/videos/218-staffel-2-episode-18-jahresrueckblick-ganze-folge',
             'info_dict': {
                 'id': '2104602',
-                'ext': 'mp4',
+                'ext': 'flv',
                 'title': 'Episode 18 - Staffel 2',
                 'description': 'md5:8733c81b702ea472e069bc48bb658fc1',
                 'upload_date': '20131231',
@@ -138,14 +138,13 @@ class ProSiebenSat1IE(InfoExtractor):
             'url': 'http://www.the-voice-of-germany.de/video/31-andreas-kuemmert-rocket-man-clip',
             'info_dict': {
                 'id': '2572814',
-                'ext': 'mp4',
+                'ext': 'flv',
                 'title': 'Andreas Kümmert: Rocket Man',
                 'description': 'md5:6ddb02b0781c6adf778afea606652e38',
                 'upload_date': '20131017',
                 'duration': 469.88,
             },
             'params': {
-                # rtmp download
                 'skip_download': True,
             },
         },
@@ -153,13 +152,12 @@ class ProSiebenSat1IE(InfoExtractor):
             'url': 'http://www.fem.com/wellness/videos/wellness-video-clip-kurztripps-zum-valentinstag.html',
             'info_dict': {
                 'id': '2156342',
-                'ext': 'mp4',
+                'ext': 'flv',
                 'title': 'Kurztrips zum Valentinstag',
-                'description': 'Romantischer Kurztrip zum Valentinstag? Wir verraten, was sich hier wirklich lohnt.',
+                'description': 'Romantischer Kurztrip zum Valentinstag? Nina Heinemann verrät, was sich hier wirklich lohnt.',
                 'duration': 307.24,
             },
             'params': {
-                # rtmp download
                 'skip_download': True,
             },
         },
@@ -172,12 +170,26 @@ class ProSiebenSat1IE(InfoExtractor):
             },
             'playlist_count': 2,
         },
+        {
+            'url': 'http://www.7tv.de/circus-halligalli/615-best-of-circus-halligalli-ganze-folge',
+            'info_dict': {
+                'id': '4187506',
+                'ext': 'flv',
+                'title': 'Best of Circus HalliGalli',
+                'description': 'md5:8849752efd90b9772c9db6fdf87fb9e9',
+                'upload_date': '20151229',
+            },
+            'params': {
+                'skip_download': True,
+            },
+        },
     ]
 
     _CLIPID_REGEXES = [
         r'"clip_id"\s*:\s+"(\d+)"',
         r'clipid: "(\d+)"',
         r'clip[iI]d=(\d+)',
+        r'clip[iI]d\s*=\s*["\'](\d+)',
         r"'itemImageUrl'\s*:\s*'/dynamic/thumbnails/full/\d+/(\d+)",
     ]
     _TITLE_REGEXES = [
@@ -186,12 +198,16 @@ class ProSiebenSat1IE(InfoExtractor):
         r'<!-- start video -->\s*<h1>(.+?)</h1>',
         r'<h1 class="att-name">\s*(.+?)</h1>',
         r'<header class="module_header">\s*<h2>([^<]+)</h2>\s*</header>',
+        r'<h2 class="video-title" itemprop="name">\s*(.+?)</h2>',
+        r'<div[^>]+id="veeseoTitle"[^>]*>(.+?)</div>',
     ]
     _DESCRIPTION_REGEXES = [
         r'<p itemprop="description">\s*(.+?)</p>',
         r'<div class="videoDecription">\s*<p><strong>Beschreibung</strong>: (.+?)</p>',
         r'<div class="g-plusone" data-size="medium"></div>\s*</div>\s*</header>\s*(.+?)\s*<footer>',
         r'<p class="att-description">\s*(.+?)\s*</p>',
+        r'<p class="video-description" itemprop="description">\s*(.+?)</p>',
+        r'<div[^>]+id="veeseoDescription"[^>]*>(.+?)</div>',
     ]
     _UPLOAD_DATE_REGEXES = [
         r'<meta property="og:published_time" content="(.+?)">',
index f70978299ac9e682f5cdb99a7396541fd08c115c..594bee4f9a681f928f37887270b062b6e7079514 100644 (file)
@@ -38,7 +38,7 @@ class UnistraIE(InfoExtractor):
 
         webpage = self._download_webpage(url, video_id)
 
-        files = set(re.findall(r'file\s*:\s*"([^"]+)"', webpage))
+        files = set(re.findall(r'file\s*:\s*"(/[^"]+)"', webpage))
 
         quality = qualities(['SD', 'HD'])
         formats = []
index 357594a11debd4e4946e6fd29b0f2b4d4fb241b9..a97995a6dfd92383c2d25ea7250142404176ecad 100644 (file)
@@ -5,12 +5,13 @@ from .common import InfoExtractor
 from ..compat import compat_urllib_parse
 from ..utils import (
     ExtractorError,
+    NO_DEFAULT,
     sanitized_Request,
 )
 
 
 class VodlockerIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?vodlocker\.com/(?:embed-)?(?P<id>[0-9a-zA-Z]+)(?:\..*?)?'
+    _VALID_URL = r'https?://(?:www\.)?vodlocker\.(?:com|city)/(?:embed-)?(?P<id>[0-9a-zA-Z]+)(?:\..*?)?'
 
     _TESTS = [{
         'url': 'http://vodlocker.com/e8wvyzz4sl42',
@@ -43,16 +44,31 @@ class VodlockerIE(InfoExtractor):
             webpage = self._download_webpage(
                 req, video_id, 'Downloading video page')
 
+        def extract_file_url(html, default=NO_DEFAULT):
+            return self._search_regex(
+                r'file:\s*"(http[^\"]+)",', html, 'file url', default=default)
+
+        video_url = extract_file_url(webpage, default=None)
+
+        if not video_url:
+            embed_url = self._search_regex(
+                r'<iframe[^>]+src=(["\'])(?P<url>(?:https?://)?vodlocker\.(?:com|city)/embed-.+?)\1',
+                webpage, 'embed url', group='url')
+            embed_webpage = self._download_webpage(
+                embed_url, video_id, 'Downloading embed webpage')
+            video_url = extract_file_url(embed_webpage)
+            thumbnail_webpage = embed_webpage
+        else:
+            thumbnail_webpage = webpage
+
         title = self._search_regex(
             r'id="file_title".*?>\s*(.*?)\s*<(?:br|span)', webpage, 'title')
         thumbnail = self._search_regex(
-            r'image:\s*"(http[^\"]+)",', webpage, 'thumbnail')
-        url = self._search_regex(
-            r'file:\s*"(http[^\"]+)",', webpage, 'file url')
+            r'image:\s*"(http[^\"]+)",', thumbnail_webpage, 'thumbnail', fatal=False)
 
         formats = [{
             'format_id': 'sd',
-            'url': url,
+            'url': video_url,
         }]
 
         return {
index 7030903c0aa4f7fc17c8f185fe31c6c1d3b0e450..4d433b6678842a59b34563491e3c552c607b28b1 100644 (file)
@@ -1,3 +1,3 @@
 from __future__ import unicode_literals
 
-__version__ = '2016.01.09'
+__version__ = '2016.01.14'