Merge pull request #5780 from jaimeMF/remove-nondash
author Sergey M. <dstftw@gmail.com>
Sun, 24 May 2015 16:42:15 +0000 (21:42 +0500)
committer Sergey M. <dstftw@gmail.com>
Sun, 24 May 2015 16:42:15 +0000 (21:42 +0500)
[youtube] Remove the nondash formats (fixes #5774)

15 files changed:
AUTHORS
README.md
youtube_dl/YoutubeDL.py
youtube_dl/extractor/__init__.py
youtube_dl/extractor/arte.py
youtube_dl/extractor/cnn.py
youtube_dl/extractor/drtv.py
youtube_dl/extractor/empflix.py
youtube_dl/extractor/karrierevideos.py [new file with mode: 0644]
youtube_dl/extractor/nextmedia.py
youtube_dl/extractor/prosiebensat1.py
youtube_dl/extractor/rtbf.py
youtube_dl/extractor/rutv.py
youtube_dl/extractor/tnaflix.py
youtube_dl/options.py

diff --git a/AUTHORS b/AUTHORS
index 267b8da1e6ffbda7853d53996de4020111074834..ebed7ebb3a3222492111d7579ae5bdd10096cf6b 100644 (file)
--- a/AUTHORS
+++ b/AUTHORS
@@ -124,3 +124,4 @@ Mohammad Teimori Pabandi
 Roman Le Négrate
 Matthias Küch
 Julian Richen
+Ping O.
diff --git a/README.md b/README.md
index 3d9436456c4bd29d11bf160efa27769b05edde2d..e51bb534341e389a26a466f1fb4c3ef721731016 100644 (file)
--- a/README.md
+++ b/README.md
@@ -17,12 +17,12 @@ youtube-dl - download videos from youtube.com or other video platforms
 To install it right away for all UNIX users (Linux, OS X, etc.), type:
 
     sudo curl https://yt-dl.org/latest/youtube-dl -o /usr/local/bin/youtube-dl
-    sudo chmod a+x /usr/local/bin/youtube-dl
+    sudo chmod a+rx /usr/local/bin/youtube-dl
 
 If you do not have curl, you can alternatively use a recent wget:
 
     sudo wget https://yt-dl.org/downloads/latest/youtube-dl -O /usr/local/bin/youtube-dl
-    sudo chmod a+x /usr/local/bin/youtube-dl
+    sudo chmod a+rx /usr/local/bin/youtube-dl
 
 Windows users can [download a .exe file](https://yt-dl.org/latest/youtube-dl.exe) and place it in their home directory or any other location on their [PATH](http://en.wikipedia.org/wiki/PATH_%28variable%29).
 
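diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py
index 58b34e087b421474112719ffb3389b252e560313..d1953c18f39b438740aec88a1aadf4d529a8e0b4 100755 (executable)
--- a/youtube_dl/YoutubeDL.py
+++ b/youtube_dl/YoutubeDL.py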
@@ -1527,6 +1527,7 @@ class YoutubeDL(object):
             pps_chain.extend(ie_info['__postprocessors'])
         pps_chain.extend(self._pps)
         for pp in pps_chain:
+            files_to_delete = []
             try:
                 files_to_delete, info = pp.run(info)
             except PostProcessingError as e:
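diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py
index 24efb7ce50e27e371c84d1e85b3b6314f0624026..79bcd910666baf7a9802dc50f7ae072827976b52 100644 (file)
--- a/youtube_dl/extractor/__init__.py
+++ b/youtube_dl/extractor/__init__.py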
@@ -244,6 +244,7 @@ from .kaltura import KalturaIE
 from .kanalplay import KanalPlayIE
 from .kankan import KankanIE
 from .karaoketv import KaraoketvIE
+from .karrierevideos import KarriereVideosIE
 from .keezmovies import KeezMoviesIE
 from .khanacademy import KhanAcademyIE
 from .kickstarter import KickStarterIE
@@ -338,8 +339,7 @@ from .newstube import NewstubeIE
 from .nextmedia import (
     NextMediaIE,
     NextMediaActionNewsIE,
-    AppleDailyRealtimeNewsIE,
-    AppleDailyAnimationNewsIE
+    AppleDailyIE,
 )
 from .nfb import NFBIE
 from .nfl import NFLIE
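diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py
index 8273bd6c9ae3cdff82052c8f63efc68be97561b3..fce38248d1eaf7410d828a9821d4123332d94983 100644 (file)
--- a/youtube_dl/extractor/arte.py
+++ b/youtube_dl/extractor/arte.py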
@@ -195,7 +195,9 @@ class ArteTVFutureIE(ArteTVPlus7IE):
     def _real_extract(self, url):
         anchor_id, lang = self._extract_url_info(url)
         webpage = self._download_webpage(url, anchor_id)
-        row = get_element_by_id(anchor_id, webpage)
+        row = self._search_regex(
+            r'(?s)id="%s"[^>]*>.+?(<div[^>]*arte_vp_url[^>]*>)' % anchor_id,
+            webpage, 'row')
         return self._extract_from_webpage(row, anchor_id, lang)
 
 
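diff --git a/youtube_dl/extractor/cnn.py b/youtube_dl/extractor/cnn.py
index 5efc5f4fe556a4424542b441a83f2d6dbd5bc8e7..3b1bd4033fd1c01986c83ab44cc1cebaa1b19e5b 100644 (file)
--- a/youtube_dl/extractor/cnn.py
+++ b/youtube_dl/extractor/cnn.py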
@@ -12,7 +12,7 @@ from ..utils import (
 
 class CNNIE(InfoExtractor):
     _VALID_URL = r'''(?x)https?://(?:(?:edition|www)\.)?cnn\.com/video/(?:data/.+?|\?)/
-        (?P<path>.+?/(?P<title>[^/]+?)(?:\.(?:[a-z]{3,5})(?:-ap)?|(?=&)))'''
+        (?P<path>.+?/(?P<title>[^/]+?)(?:\.(?:[a-z\-]+)|(?=&)))'''
 
     _TESTS = [{
         'url': 'http://edition.cnn.com/video/?/video/sports/2013/06/09/nadal-1-on-1.cnn',
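diff --git a/youtube_dl/extractor/drtv.py b/youtube_dl/extractor/drtv.py
index f25ab319e66d4d5b151cd9a9d4509807b6a88617..baa24c6d13abe016cceb83bb927db15d7d300509 100644 (file)
--- a/youtube_dl/extractor/drtv.py
+++ b/youtube_dl/extractor/drtv.py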
@@ -1,8 +1,11 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
-from .common import InfoExtractor, ExtractorError
-from ..utils import parse_iso8601
+from .common import InfoExtractor
+from ..utils import (
+    ExtractorError,
+    parse_iso8601,
+)
 
 
 class DRTVIE(InfoExtractor):
@@ -60,19 +63,31 @@ class DRTVIE(InfoExtractor):
                 restricted_to_denmark = asset['RestrictedToDenmark']
                 spoken_subtitles = asset['Target'] == 'SpokenSubtitles'
                 for link in asset['Links']:
-                    target = link['Target']
                     uri = link['Uri']
+                    target = link['Target']
                     format_id = target
-                    preference = -1 if target == 'HDS' else -2
+                    preference = None
                     if spoken_subtitles:
-                        preference -= 2
+                        preference = -1
                         format_id += '-spoken-subtitles'
-                    formats.append({
-                        'url': uri + '?hdcore=3.3.0&plugin=aasp-3.3.0.99.43' if target == 'HDS' else uri,
-                        'format_id': format_id,
-                        'ext': link['FileFormat'],
-                        'preference': preference,
-                    })
+                    if target == 'HDS':
+                        formats.extend(self._extract_f4m_formats(
+                            uri + '?hdcore=3.3.0&plugin=aasp-3.3.0.99.43',
+                            video_id, preference, f4m_id=format_id))
+                    elif target == 'HLS':
+                        formats.extend(self._extract_m3u8_formats(
+                            uri, video_id, 'mp4', preference=preference,
+                            m3u8_id=format_id))
+                    else:
+                        bitrate = link.get('Bitrate')
+                        if bitrate:
+                            format_id += '-%s' % bitrate
+                        formats.append({
+                            'url': uri,
+                            'format_id': format_id,
+                            'tbr': bitrate,
+                            'ext': link.get('FileFormat'),
+                        })
                 subtitles_list = asset.get('SubtitlesList')
                 if isinstance(subtitles_list, list):
                     LANGS = {
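diff --git a/youtube_dl/extractor/empflix.py b/youtube_dl/extractor/empflix.py
index 70f8efe27578c4d43b27378a4a2c80d495a7488c..9a5a8f4bb44039e6c52968801033a3d12a73d835 100644 (file)
--- a/youtube_dl/extractor/empflix.py
+++ b/youtube_dl/extractor/empflix.py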
@@ -4,22 +4,28 @@ from .tnaflix import TNAFlixIE
 
 
 class EMPFlixIE(TNAFlixIE):
-    _VALID_URL = r'^https?://www\.empflix\.com/videos/(?P<display_id>[0-9a-zA-Z-]+)-(?P<id>[0-9]+)\.html'
+    _VALID_URL = r'https?://(?:www\.)?empflix\.com/videos/(?P<display_id>.+?)-(?P<id>[0-9]+)\.html'
 
     _TITLE_REGEX = r'name="title" value="(?P<title>[^"]*)"'
     _DESCRIPTION_REGEX = r'name="description" value="([^"]*)"'
     _CONFIG_REGEX = r'flashvars\.config\s*=\s*escape\("([^"]+)"'
 
-    _TEST = {
-        'url': 'http://www.empflix.com/videos/Amateur-Finger-Fuck-33051.html',
-        'md5': 'b1bc15b6412d33902d6e5952035fcabc',
-        'info_dict': {
-            'id': '33051',
-            'display_id': 'Amateur-Finger-Fuck',
-            'ext': 'mp4',
-            'title': 'Amateur Finger Fuck',
-            'description': 'Amateur solo finger fucking.',
-            'thumbnail': 're:https?://.*\.jpg$',
-            'age_limit': 18,
+    _TESTS = [
+        {
+            'url': 'http://www.empflix.com/videos/Amateur-Finger-Fuck-33051.html',
+            'md5': 'b1bc15b6412d33902d6e5952035fcabc',
+            'info_dict': {
+                'id': '33051',
+                'display_id': 'Amateur-Finger-Fuck',
+                'ext': 'mp4',
+                'title': 'Amateur Finger Fuck',
+                'description': 'Amateur solo finger fucking.',
+                'thumbnail': 're:https?://.*\.jpg$',
+                'age_limit': 18,
+            }
+        },
+        {
+            'url': 'http://www.empflix.com/videos/[AROMA][ARMD-718]-Aoi-Yoshino-Sawa-25826.html',
+            'only_matching': True,
         }
-    }
+    ]
diff --git a/youtube_dl/extractor/karrierevideos.py b/youtube_dl/extractor/karrierevideos.py
new file mode 100644 (file)
index 0000000..bed94bc
--- /dev/null
+++ b/youtube_dl/extractor/karrierevideos.py
@@ -0,0 +1,96 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import compat_urlparse
+from ..utils import (
+    fix_xml_ampersands,
+    float_or_none,
+    xpath_with_ns,
+    xpath_text,
+)
+
+
+class KarriereVideosIE(InfoExtractor):
+    _VALID_URL = r'http://(?:www\.)?karrierevideos\.at(?:/[^/]+)+/(?P<id>[^/]+)'
+    _TESTS = [{
+        'url': 'http://www.karrierevideos.at/berufsvideos/mittlere-hoehere-schulen/altenpflegerin',
+        'info_dict': {
+            'id': '32c91',
+            'ext': 'flv',
+            'title': 'AltenpflegerIn',
+            'description': 'md5:dbadd1259fde2159a9b28667cb664ae2',
+            'thumbnail': 're:^http://.*\.png',
+        },
+        'params': {
+            # rtmp download
+            'skip_download': True,
+        }
+    }, {
+        # broken ampersands
+        'url': 'http://www.karrierevideos.at/orientierung/vaeterkarenz-und-neue-chancen-fuer-muetter-baby-was-nun',
+        'info_dict': {
+            'id': '5sniu',
+            'ext': 'flv',
+            'title': 'Väterkarenz und neue Chancen für Mütter - "Baby - was nun?"',
+            'description': 'md5:97092c6ad1fd7d38e9d6a5fdeb2bcc33',
+            'thumbnail': 're:^http://.*\.png',
+        },
+        'params': {
+            # rtmp download
+            'skip_download': True,
+        }
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
+        title = (self._html_search_meta('title', webpage, default=None) or
+                 self._search_regex(r'<h1 class="title">([^<]+)</h1>', webpage, 'title'))
+
+        video_id = self._search_regex(
+            r'/config/video/(.+?)\.xml', webpage, 'video id')
+        playlist = self._download_xml(
+            'http://www.karrierevideos.at/player-playlist.xml.php?p=%s' % video_id,
+            video_id, transform_source=fix_xml_ampersands)
+
+        NS_MAP = {
+            'jwplayer': 'http://developer.longtailvideo.com/trac/wiki/FlashFormats'
+        }
+
+        def ns(path):
+            return xpath_with_ns(path, NS_MAP)
+
+        item = playlist.find('./tracklist/item')
+        video_file = xpath_text(
+            item, ns('./jwplayer:file'), 'video url', fatal=True)
+        streamer = xpath_text(
+            item, ns('./jwplayer:streamer'), 'streamer', fatal=True)
+
+        uploader = xpath_text(
+            item, ns('./jwplayer:author'), 'uploader')
+        duration = float_or_none(
+            xpath_text(item, ns('./jwplayer:duration'), 'duration'))
+
+        description = self._html_search_regex(
+            r'(?s)<div class="leadtext">(.+?)</div>',
+            webpage, 'description')
+
+        thumbnail = self._html_search_meta(
+            'thumbnail', webpage, 'thumbnail')
+        if thumbnail:
+            thumbnail = compat_urlparse.urljoin(url, thumbnail)
+
+        return {
+            'id': video_id,
+            'url': streamer.replace('rtmpt', 'rtmp'),
+            'play_path': 'mp4:%s' % video_file,
+            'ext': 'flv',
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'uploader': uploader,
+            'duration': duration,
+        }
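diff --git a/youtube_dl/extractor/nextmedia.py b/youtube_dl/extractor/nextmedia.py
index 02dba4ef639e64deff790f94bd5cd0cd6da2a4cb..d1b7cff4cfbf30c76c52ae98ad247e8907be6abd 100644 (file)
--- a/youtube_dl/extractor/nextmedia.py
+++ b/youtube_dl/extractor/nextmedia.py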
@@ -89,8 +89,8 @@ class NextMediaActionNewsIE(NextMediaIE):
         return self._extract_from_nextmedia_page(news_id, url, article_page)
 
 
-class AppleDailyRealtimeNewsIE(NextMediaIE):
-    _VALID_URL = r'http://(www|ent).appledaily.com.tw/(realtimenews|enews)/[^/]+/[^/]+/(?P<date>\d+)/(?P<id>\d+)(/.*)?'
+class AppleDailyIE(NextMediaIE):
+    _VALID_URL = r'http://(www|ent).appledaily.com.tw/(?:animation|appledaily|enews|realtimenews)/[^/]+/[^/]+/(?P<date>\d+)/(?P<id>\d+)(/.*)?'
     _TESTS = [{
         'url': 'http://ent.appledaily.com.tw/enews/article/entertainment/20150128/36354694',
         'md5': 'a843ab23d150977cc55ef94f1e2c1e4d',
@@ -99,7 +99,7 @@ class AppleDailyRealtimeNewsIE(NextMediaIE):
             'ext': 'mp4',
             'title': '周亭羽走過摩鐵陰霾2男陪吃 九把刀孤寒看醫生',
             'thumbnail': 're:^https?://.*\.jpg$',
-            'description': 'md5:b23787119933404ce515c6356a8c355c',
+            'description': 'md5:2acd430e59956dc47cd7f67cb3c003f4',
             'upload_date': '20150128',
         }
     }, {
@@ -110,26 +110,10 @@ class AppleDailyRealtimeNewsIE(NextMediaIE):
             'ext': 'mp4',
             'title': '不滿被踩腳 山東兩大媽一路打下車',
             'thumbnail': 're:^https?://.*\.jpg$',
-            'description': 'md5:2648aaf6fc4f401f6de35a91d111aa1d',
+            'description': 'md5:175b4260c1d7c085993474217e4ab1b4',
             'upload_date': '20150128',
         }
-    }]
-
-    _URL_PATTERN = r'\{url: \'(.+)\'\}'
-
-    def _fetch_title(self, page):
-        return self._html_search_regex(r'<h1 id="h1">([^<>]+)</h1>', page, 'news title')
-
-    def _fetch_thumbnail(self, page):
-        return self._html_search_regex(r"setInitialImage\(\'([^']+)'\)", page, 'video thumbnail', fatal=False)
-
-    def _fetch_timestamp(self, page):
-        return None
-
-
-class AppleDailyAnimationNewsIE(AppleDailyRealtimeNewsIE):
-    _VALID_URL = 'http://www.appledaily.com.tw/animation/[^/]+/[^/]+/(?P<date>\d+)/(?P<id>\d+)(/.*)?'
-    _TESTS = [{
+    }, {
         'url': 'http://www.appledaily.com.tw/animation/realtimenews/new/20150128/5003671',
         'md5': '03df296d95dedc2d5886debbb80cb43f',
         'info_dict': {
@@ -154,10 +138,22 @@ class AppleDailyAnimationNewsIE(AppleDailyRealtimeNewsIE):
         'expected_warnings': [
             'video thumbnail',
         ]
+    }, {
+        'url': 'http://www.appledaily.com.tw/appledaily/article/supplement/20140417/35770334/',
+        'only_matching': True,
     }]
 
+    _URL_PATTERN = r'\{url: \'(.+)\'\}'
+
     def _fetch_title(self, page):
-        return self._html_search_meta('description', page, 'news title')
+        return (self._html_search_regex(r'<h1 id="h1">([^<>]+)</h1>', page, 'news title', default=None) or
+                self._html_search_meta('description', page, 'news title'))
+
+    def _fetch_thumbnail(self, page):
+        return self._html_search_regex(r"setInitialImage\(\'([^']+)'\)", page, 'video thumbnail', fatal=False)
+
+    def _fetch_timestamp(self, page):
+        return None
 
     def _fetch_description(self, page):
         return self._html_search_meta('description', page, 'news description')
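diff --git a/youtube_dl/extractor/prosiebensat1.py b/youtube_dl/extractor/prosiebensat1.py
index 7cc7996642cae1de1ca2a585391d167025b92162..255d4abc131519ec470ccdc2b1a64b7d38d9f44b 100644 (file)
--- a/youtube_dl/extractor/prosiebensat1.py
+++ b/youtube_dl/extractor/prosiebensat1.py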
@@ -17,7 +17,7 @@ from ..utils import (
 class ProSiebenSat1IE(InfoExtractor):
     IE_NAME = 'prosiebensat1'
     IE_DESC = 'ProSiebenSat.1 Digital'
-    _VALID_URL = r'https?://(?:www\.)?(?:(?:prosieben|prosiebenmaxx|sixx|sat1|kabeleins|ran|the-voice-of-germany)\.de|fem\.com)/(?P<id>.+)'
+    _VALID_URL = r'https?://(?:www\.)?(?:(?:prosieben|prosiebenmaxx|sixx|sat1|kabeleins|the-voice-of-germany)\.(?:de|at)|ran\.de|fem\.com)/(?P<id>.+)'
 
     _TESTS = [
         {
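diff --git a/youtube_dl/extractor/rtbf.py b/youtube_dl/extractor/rtbf.py
index dce64e1517003015722db1097ac83b106cc91136..5a381d9ced41516db44d7e17120b29948a1957cb 100644 (file)
--- a/youtube_dl/extractor/rtbf.py
+++ b/youtube_dl/extractor/rtbf.py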
@@ -1,10 +1,11 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
-import re
-import json
-
 from .common import InfoExtractor
+from ..utils import (
+    int_or_none,
+    unescapeHTML,
+)
 
 
 class RTBFIE(InfoExtractor):
@@ -16,25 +17,24 @@ class RTBFIE(InfoExtractor):
             'id': '1921274',
             'ext': 'mp4',
             'title': 'Les Diables au coeur (épisode 2)',
-            'description': 'Football - Diables Rouges',
             'duration': 3099,
-            'timestamp': 1398456336,
-            'upload_date': '20140425',
         }
     }
 
     def _real_extract(self, url):
-        mobj = re.match(self._VALID_URL, url)
-        video_id = mobj.group('id')
+        video_id = self._match_id(url)
 
-        page = self._download_webpage('https://www.rtbf.be/video/embed?id=%s' % video_id, video_id)
+        webpage = self._download_webpage(
+            'http://www.rtbf.be/video/embed?id=%s' % video_id, video_id)
 
-        data = json.loads(self._html_search_regex(
-            r'<div class="js-player-embed(?: player-embed)?" data-video="([^"]+)"', page, 'data video'))['data']
+        data = self._parse_json(
+            unescapeHTML(self._search_regex(
+                r'data-video="([^"]+)"', webpage, 'data video')),
+            video_id)
 
         video_url = data.get('downloadUrl') or data.get('url')
 
-        if data['provider'].lower() == 'youtube':
+        if (data.get('provider') or '').lower() == 'youtube':
             return self.url_result(video_url, 'Youtube')
 
         return {
@@ -42,8 +42,8 @@ class RTBFIE(InfoExtractor):
             'url': video_url,
             'title': data['title'],
             'description': data.get('description') or data.get('subtitle'),
-            'thumbnail': data['thumbnail']['large'],
+            'thumbnail': data.get('thumbnail'),
             'duration': data.get('duration') or data.get('realDuration'),
-            'timestamp': data['created'],
-            'view_count': data['viewCount'],
+            'timestamp': int_or_none(data.get('created')),
+            'view_count': int_or_none(data.get('viewCount')),
         }
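diff --git a/youtube_dl/extractor/rutv.py b/youtube_dl/extractor/rutv.py
index 55604637dca22533cd765529dcb2abfb759fd9c1..d9df0686133a6772deb1e58260069857620afc58 100644 (file)
--- a/youtube_dl/extractor/rutv.py
+++ b/youtube_dl/extractor/rutv.py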
@@ -104,7 +104,7 @@ class RUTVIE(InfoExtractor):
     @classmethod
     def _extract_url(cls, webpage):
         mobj = re.search(
-            r'<iframe[^>]+?src=(["\'])(?P<url>https?://player\.rutv\.ru/(?:iframe/(?:swf|video|live)/id|index/iframe/cast_id)/.+?)\1', webpage)
+            r'<iframe[^>]+?src=(["\'])(?P<url>https?://player\.(?:rutv\.ru|vgtrk\.com)/(?:iframe/(?:swf|video|live)/id|index/iframe/cast_id)/.+?)\1', webpage)
         if mobj:
             return mobj.group('url')
 
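diff --git a/youtube_dl/extractor/tnaflix.py b/youtube_dl/extractor/tnaflix.py
index d48cbbf140054e639f7191acfa0909972ef3ab76..59af9aba06399cefcc6c2049c958dfb3819bb20a 100644 (file)
--- a/youtube_dl/extractor/tnaflix.py
+++ b/youtube_dl/extractor/tnaflix.py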
@@ -10,26 +10,32 @@ from ..utils import (
 
 
 class TNAFlixIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?tnaflix\.com/(?P<cat_id>[\w-]+)/(?P<display_id>[\w-]+)/video(?P<id>\d+)'
+    _VALID_URL = r'https?://(?:www\.)?tnaflix\.com/[^/]+/(?P<display_id>[^/]+)/video(?P<id>\d+)'
 
     _TITLE_REGEX = r'<title>(.+?) - TNAFlix Porn Videos</title>'
     _DESCRIPTION_REGEX = r'<h3 itemprop="description">([^<]+)</h3>'
     _CONFIG_REGEX = r'flashvars\.config\s*=\s*escape\("([^"]+)"'
 
-    _TEST = {
-        'url': 'http://www.tnaflix.com/porn-stars/Carmella-Decesare-striptease/video553878',
-        'md5': 'ecf3498417d09216374fc5907f9c6ec0',
-        'info_dict': {
-            'id': '553878',
-            'display_id': 'Carmella-Decesare-striptease',
-            'ext': 'mp4',
-            'title': 'Carmella Decesare - striptease',
-            'description': '',
-            'thumbnail': 're:https?://.*\.jpg$',
-            'duration': 91,
-            'age_limit': 18,
+    _TESTS = [
+        {
+            'url': 'http://www.tnaflix.com/porn-stars/Carmella-Decesare-striptease/video553878',
+            'md5': 'ecf3498417d09216374fc5907f9c6ec0',
+            'info_dict': {
+                'id': '553878',
+                'display_id': 'Carmella-Decesare-striptease',
+                'ext': 'mp4',
+                'title': 'Carmella Decesare - striptease',
+                'description': '',
+                'thumbnail': 're:https?://.*\.jpg$',
+                'duration': 91,
+                'age_limit': 18,
+            }
+        },
+        {
+            'url': 'https://www.tnaflix.com/amateur-porn/bunzHD-Ms.Donk/video358632',
+            'only_matching': True,
         }
-    }
+    ]
 
     def _real_extract(self, url):
         mobj = re.match(self._VALID_URL, url)
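diff --git a/youtube_dl/options.py b/youtube_dl/options.py
index dd07266b7b5e726ab568cb49a152ad702935db51..5a2315bd96ce0c6abfdf4a8bea65aa68e6fa370b 100644 (file)
--- a/youtube_dl/options.py
+++ b/youtube_dl/options.py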
@@ -537,7 +537,7 @@ def parseOpts(overrideArguments=None):
     verbosity.add_option(
         '--dump-pages', '--dump-intermediate-pages',
         action='store_true', dest='dump_intermediate_pages', default=False,
-        help='Print downloaded pages to debug problems (very verbose)')
+        help='Print downloaded pages encoded using base64 to debug problems (very verbose)')
     verbosity.add_option(
         '--write-pages',
         action='store_true', dest='write_pages', default=False,