Merge branch 'varzesh3' of https://github.com/mtp1376/youtube-dl into mtp1376-varzesh3
author Sergey M․ <dstftw@gmail.com>
Wed, 1 Apr 2015 14:25:46 +0000 (20:25 +0600)
committer Sergey M․ <dstftw@gmail.com>
Wed, 1 Apr 2015 14:25:46 +0000 (20:25 +0600)
38 files changed:
AUTHORS
Makefile
devscripts/check-porn.py
docs/supportedsites.md
test/test_all_urls.py
test/test_utils.py
youtube_dl/YoutubeDL.py
youtube_dl/compat.py
youtube_dl/downloader/common.py
youtube_dl/downloader/http.py
youtube_dl/downloader/rtmp.py
youtube_dl/extractor/__init__.py
youtube_dl/extractor/cnn.py
youtube_dl/extractor/dailymotion.py
youtube_dl/extractor/dhm.py [new file with mode: 0644]
youtube_dl/extractor/douyutv.py
youtube_dl/extractor/dumpert.py [new file with mode: 0644]
youtube_dl/extractor/eroprofile.py
youtube_dl/extractor/generic.py
youtube_dl/extractor/nbc.py
youtube_dl/extractor/phoenix.py
youtube_dl/extractor/pornhub.py
youtube_dl/extractor/safari.py [new file with mode: 0644]
youtube_dl/extractor/slideshare.py
youtube_dl/extractor/soundcloud.py
youtube_dl/extractor/teamcoco.py
youtube_dl/extractor/theplatform.py
youtube_dl/extractor/ultimedia.py
youtube_dl/extractor/vessel.py [new file with mode: 0644]
youtube_dl/extractor/vimeo.py
youtube_dl/extractor/xuite.py
youtube_dl/extractor/yahoo.py
youtube_dl/extractor/youporn.py
youtube_dl/extractor/youtube.py
youtube_dl/postprocessor/ffmpeg.py
youtube_dl/update.py
youtube_dl/utils.py
youtube_dl/version.py

diff --git a/AUTHORS b/AUTHORS
index 59f1b5f21345b95622d4c21af7354f355dc782f8..48be31e29f821c49f9aafed970b9a8c68a97e069 100644 (file)
--- a/AUTHORS
+++ b/AUTHORS
@@ -118,3 +118,4 @@ Devin J. Pohly
 Eduardo Ferro Aldama
 Jeff Buchbinder
 Amish Bhadeshia
+Joram Schrijver
index c6c76274f995a85185290d35868b974c13240aa2..fdb1abb60cacfe49295a7438e3d0f4f51c248359 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -2,7 +2,7 @@ all: youtube-dl README.md CONTRIBUTING.md README.txt youtube-dl.1 youtube-dl.bas
 
 clean:
        rm -rf youtube-dl.1.temp.md youtube-dl.1 youtube-dl.bash-completion README.txt MANIFEST build/ dist/ .coverage cover/ youtube-dl.tar.gz youtube-dl.zsh youtube-dl.fish *.dump *.part *.info.json *.mp4 *.flv *.mp3 *.avi CONTRIBUTING.md.tmp youtube-dl youtube-dl.exe
-       find -name "*.pyc" -delete
+       find -name "*.pyc" -delete
 
 PREFIX ?= /usr/local
 BINDIR ?= $(PREFIX)/bin
index 6a5bd9eda333246c47064bf84cfc03da09de4caf..7a219ebe97c555be79a55cf4dc30bf2cb823ca28 100644 (file)
@@ -28,7 +28,7 @@ for test in get_testcases():
     if METHOD == 'EURISTIC':
         try:
             webpage = compat_urllib_request.urlopen(test['url'], timeout=10).read()
-        except:
+        except Exception:
             print('\nFail: {0}'.format(test['name']))
             continue
 
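The bare except: clauses replaced throughout this merge caught everything, including KeyboardInterrupt and SystemExit, which inherit from BaseException; except Exception lets those propagate so Ctrl-C still aborts the run. A minimal sketch of the difference (standalone illustration, not part of the diff):

    def fetch(url):
        raise KeyboardInterrupt  # e.g. the user presses Ctrl-C mid-request

    try:
        fetch('http://example.com')
    except Exception:
        # Ordinary failures (URLError, socket.timeout, ...) are handled here,
        # while KeyboardInterrupt/SystemExit now propagate and stop the script.
        print('Fail')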
index baf7b3880fd6180f19cdf44f04a9d38d2e0d4062..fd59cc2be40563c86104200f13897c110a9d01f3 100644 (file)
@@ -2,6 +2,8 @@
  - **1tv**: Первый канал
  - **1up.com**
  - **220.ro**
+ - **22tracks:genre**
+ - **22tracks:track**
  - **24video**
  - **3sat**
  - **4tube**
  - **rutube:movie**: Rutube movies
  - **rutube:person**: Rutube person videos
  - **RUTV**: RUTV.RU
+ - **safari**: safaribooksonline.com online video
+ - **safari:course**: safaribooksonline.com online courses
  - **Sandia**: Sandia National Laboratories
  - **Sapo**: SAPO Vídeos
  - **savefrom.net**
  - **Vbox7**
  - **VeeHD**
  - **Veoh**
+ - **Vessel**
  - **Vesti**: Вести.Ru
  - **Vevo**
  - **VGTV**
  - **youtube:show**: YouTube.com (multi-season) shows
  - **youtube:subscriptions**: YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)
  - **youtube:user**: YouTube.com user videos (URL or "ytuser" keyword)
- - **youtube:watch_later**: Youtube watch later list, ":ytwatchlater" for short (requires authentication)
+ - **youtube:watchlater**: Youtube watch later list, ":ytwatchlater" for short (requires authentication)
  - **Zapiks**
  - **ZDF**
  - **ZDFChannel**
index 6ae168b7f472938a3a69344fac123e047478ee73..a9db42b300864180c10dca730f772f7f5a26aad8 100644 (file)
@@ -59,7 +59,7 @@ class TestAllURLsMatching(unittest.TestCase):
         self.assertMatch('www.youtube.com/NASAgovVideo/videos', ['youtube:user'])
 
     def test_youtube_feeds(self):
-        self.assertMatch('https://www.youtube.com/feed/watch_later', ['youtube:watch_later'])
+        self.assertMatch('https://www.youtube.com/feed/watch_later', ['youtube:watchlater'])
         self.assertMatch('https://www.youtube.com/feed/subscriptions', ['youtube:subscriptions'])
         self.assertMatch('https://www.youtube.com/feed/recommended', ['youtube:recommended'])
         self.assertMatch('https://www.youtube.com/my_favorites', ['youtube:favorites'])
index a8ab876850d3645eb615d552162128befdac316d..abaf1ab7331eab044c684b592bd5bfb85f4d39e0 100644 (file)
@@ -200,6 +200,8 @@ class TestUtil(unittest.TestCase):
 
     def test_unescape_html(self):
         self.assertEqual(unescapeHTML('%20;'), '%20;')
+        self.assertEqual(unescapeHTML('&#x2F;'), '/')
+        self.assertEqual(unescapeHTML('&#47;'), '/')
         self.assertEqual(
             unescapeHTML('&eacute;'), 'é')
 
index b5ef5e009f972cb9308bf5c09c561ce393c05a2a..640b8c99d75d6215b232c0d2d678dd903f41e8bb 100755 (executable)
@@ -1701,10 +1701,10 @@ class YoutubeDL(object):
             out = out.decode().strip()
             if re.match('[0-9a-f]+', out):
                 self._write_string('[debug] Git HEAD: ' + out + '\n')
-        except:
+        except Exception:
             try:
                 sys.exc_clear()
-            except:
+            except Exception:
                 pass
         self._write_string('[debug] Python version %s - %s\n' % (
             platform.python_version(), platform_name()))
index b2bf149ef63ffb1c31bfb02f976f5cce2dbadad3..973bcd32074107f70c1b781e95b97ef34501b88f 100644 (file)
@@ -389,7 +389,7 @@ else:
                 stdout=subprocess.PIPE, stderr=subprocess.PIPE)
             out, err = sp.communicate()
             lines, columns = map(int, out.split())
-        except:
+        except Exception:
             pass
         return _terminal_size(columns, lines)
 
index 8ed5c19a6b8079c9807faa4fa7a505a398974975..a0fc5ead06a4e8adba7886d2a4087d28ca9cdbcb 100644 (file)
@@ -204,7 +204,7 @@ class FileDownloader(object):
             return
         try:
             os.utime(filename, (time.time(), filetime))
-        except:
+        except Exception:
             pass
         return filetime
 
@@ -318,7 +318,7 @@ class FileDownloader(object):
         )
 
         continuedl_and_exists = (
-            self.params.get('continuedl', False) and
+            self.params.get('continuedl', True) and
             os.path.isfile(encodeFilename(filename)) and
             not self.params.get('nopart', False)
         )
index 4047d7167478b34c9cacc79dcbbfeee1bb31d317..d136bebd1fe45761312bd90c31a95ddaf1754271 100644 (file)
@@ -49,7 +49,7 @@ class HttpFD(FileDownloader):
 
         open_mode = 'wb'
         if resume_len != 0:
-            if self.params.get('continuedl', False):
+            if self.params.get('continuedl', True):
                 self.report_resuming_byte(resume_len)
                 request.add_header('Range', 'bytes=%d-' % resume_len)
                 open_mode = 'ab'
index 89e98ae61e128c80eab5b0e04109b1baa2ecff7e..ddf5724ae6f05259194c67473bfd212ddeb896ff 100644 (file)
@@ -105,7 +105,7 @@ class RtmpFD(FileDownloader):
         protocol = info_dict.get('rtmp_protocol', None)
         real_time = info_dict.get('rtmp_real_time', False)
         no_resume = info_dict.get('no_resume', False)
-        continue_dl = info_dict.get('continuedl', False)
+        continue_dl = info_dict.get('continuedl', True)
 
         self.report_destination(filename)
         tmpfilename = self.temp_name(filename)
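Across common.py, http.py and rtmp.py the continuedl default flips from False to True, making resumption of partial downloads the out-of-the-box behavior; only an option that explicitly stores False (e.g. --no-continue) restarts from scratch. The dict.get defaults interact like this (toy illustration):

    params = {}                         # no option given
    params.get('continuedl', True)      # -> True: resume by default
    params = {'continuedl': False}      # user opted out
    params.get('continuedl', True)      # -> False: download from the start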
index 3011b784d2e6a0a7114b204a423b55399b083174..f3ebe4ee2bd3fac7e6add7e146a882a574c9f71a 100644 (file)
@@ -106,6 +106,7 @@ from .dbtv import DBTVIE
 from .dctp import DctpTvIE
 from .deezer import DeezerPlaylistIE
 from .dfb import DFBIE
+from .dhm import DHMIE
 from .dotsub import DotsubIE
 from .douyutv import DouyuTVIE
 from .dreisat import DreiSatIE
@@ -114,6 +115,7 @@ from .drtuber import DrTuberIE
 from .drtv import DRTVIE
 from .dvtv import DVTVIE
 from .dump import DumpIE
+from .dumpert import DumpertIE
 from .defense import DefenseGouvFrIE
 from .discovery import DiscoveryIE
 from .divxstage import DivxStageIE
@@ -309,6 +311,8 @@ from .nba import NBAIE
 from .nbc import (
     NBCIE,
     NBCNewsIE,
+    NBCSportsIE,
+    NBCSportsVPlayerIE,
 )
 from .ndr import NDRIE
 from .ndtv import NDTVIE
@@ -420,6 +424,10 @@ from .rutube import (
 )
 from .rutv import RUTVIE
 from .sandia import SandiaIE
+from .safari import (
+    SafariIE,
+    SafariCourseIE,
+)
 from .sapo import SapoIE
 from .savefrom import SaveFromIE
 from .sbs import SBSIE
@@ -552,6 +560,7 @@ from .varzesh3 import Varzesh3IE
 from .vbox7 import Vbox7IE
 from .veehd import VeeHDIE
 from .veoh import VeohIE
+from .vessel import VesselIE
 from .vesti import VestiIE
 from .vevo import VevoIE
 from .vgtv import VGTVIE
index 90ea074387ef6afe4aaa87a41c13ec6cf5a1aa7b..0a77e951c52ebe9b3a1307203e31d0658c2c89cc 100644 (file)
@@ -12,7 +12,7 @@ from ..utils import (
 
 class CNNIE(InfoExtractor):
     _VALID_URL = r'''(?x)https?://(?:(?:edition|www)\.)?cnn\.com/video/(?:data/.+?|\?)/
-        (?P<path>.+?/(?P<title>[^/]+?)(?:\.(?:cnn|hln)(?:-ap)?|(?=&)))'''
+        (?P<path>.+?/(?P<title>[^/]+?)(?:\.(?:cnn|hln|ktvk)(?:-ap)?|(?=&)))'''
 
     _TESTS = [{
         'url': 'http://edition.cnn.com/video/?/video/sports/2013/06/09/nadal-1-on-1.cnn',
@@ -45,6 +45,9 @@ class CNNIE(InfoExtractor):
             'description': 'md5:e7223a503315c9f150acac52e76de086',
             'upload_date': '20141222',
         }
+    }, {
+        'url': 'http://cnn.com/video/?/video/politics/2015/03/27/pkg-arizona-senator-church-attendance-mandatory.ktvk',
+        'only_matching': True,
     }]
 
     def _real_extract(self, url):
index 4f67c3aacc5dec5410b76d08728eecaf429f222b..47d58330b6cfd5b1df32374e9debb11a89347554 100644 (file)
@@ -25,8 +25,7 @@ class DailymotionBaseInfoExtractor(InfoExtractor):
     def _build_request(url):
         """Build a request with the family filter disabled"""
         request = compat_urllib_request.Request(url)
-        request.add_header('Cookie', 'family_filter=off')
-        request.add_header('Cookie', 'ff=off')
+        request.add_header('Cookie', 'family_filter=off; ff=off')
         return request
 
 
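The two stacked add_header('Cookie', ...) calls never accumulated: urllib's Request keeps headers in a dict keyed by capitalized name, so the second call silently replaced family_filter=off. Merging both pairs into a single '; '-separated value sends them together. A quick demonstration (illustrative snippet, not from the diff):

    try:
        from urllib.request import Request  # Python 3
    except ImportError:
        from urllib2 import Request         # Python 2

    req = Request('http://www.dailymotion.com/')
    req.add_header('Cookie', 'family_filter=off')
    req.add_header('Cookie', 'ff=off')
    print(req.get_header('Cookie'))         # 'ff=off' -- first pair was lost
    req.add_header('Cookie', 'family_filter=off; ff=off')
    print(req.get_header('Cookie'))         # both pairs in one header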
@@ -112,8 +111,9 @@ class DailymotionIE(DailymotionBaseInfoExtractor):
             video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1)
 
         embed_url = 'http://www.dailymotion.com/embed/video/%s' % video_id
-        embed_page = self._download_webpage(embed_url, video_id,
-                                            'Downloading embed page')
+        embed_request = self._build_request(embed_url)
+        embed_page = self._download_webpage(
+            embed_request, video_id, 'Downloading embed page')
         info = self._search_regex(r'var info = ({.*?}),$', embed_page,
                                   'video info', flags=re.MULTILINE)
         info = json.loads(info)
diff --git a/youtube_dl/extractor/dhm.py b/youtube_dl/extractor/dhm.py
new file mode 100644 (file)
index 0000000..3ed1f16
--- /dev/null
@@ -0,0 +1,73 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+    xpath_text,
+    parse_duration,
+)
+
+
+class DHMIE(InfoExtractor):
+    IE_DESC = 'Filmarchiv - Deutsches Historisches Museum'
+    _VALID_URL = r'https?://(?:www\.)?dhm\.de/filmarchiv/(?:[^/]+/)+(?P<id>[^/]+)'
+
+    _TESTS = [{
+        'url': 'http://www.dhm.de/filmarchiv/die-filme/the-marshallplan-at-work-in-west-germany/',
+        'md5': '11c475f670209bf6acca0b2b7ef51827',
+        'info_dict': {
+            'id': 'the-marshallplan-at-work-in-west-germany',
+            'ext': 'flv',
+            'title': 'MARSHALL PLAN AT WORK IN WESTERN GERMANY, THE',
+            'description': 'md5:1fabd480c153f97b07add61c44407c82',
+            'duration': 660,
+            'thumbnail': 're:^https?://.*\.jpg$',
+        },
+    }, {
+        'url': 'http://www.dhm.de/filmarchiv/02-mapping-the-wall/peter-g/rolle-1/',
+        'md5': '09890226332476a3e3f6f2cb74734aa5',
+        'info_dict': {
+            'id': 'rolle-1',
+            'ext': 'flv',
+            'title': 'ROLLE 1',
+            'thumbnail': 're:^https?://.*\.jpg$',
+        },
+    }]
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
+        playlist_url = self._search_regex(
+            r"file\s*:\s*'([^']+)'", webpage, 'playlist url')
+
+        playlist = self._download_xml(playlist_url, video_id)
+
+        track = playlist.find(
+            './{http://xspf.org/ns/0/}trackList/{http://xspf.org/ns/0/}track')
+
+        video_url = xpath_text(
+            track, './{http://xspf.org/ns/0/}location',
+            'video url', fatal=True)
+        thumbnail = xpath_text(
+            track, './{http://xspf.org/ns/0/}image',
+            'thumbnail')
+
+        title = self._search_regex(
+            [r'dc:title="([^"]+)"', r'<title> &raquo;([^<]+)</title>'],
+            webpage, 'title').strip()
+        description = self._html_search_regex(
+            r'<p><strong>Description:</strong>(.+?)</p>',
+            webpage, 'description', default=None)
+        duration = parse_duration(self._search_regex(
+            r'<em>Length\s*</em>\s*:\s*</strong>([^<]+)',
+            webpage, 'duration', default=None))
+
+        return {
+            'id': video_id,
+            'url': video_url,
+            'title': title,
+            'description': description,
+            'duration': duration,
+            'thumbnail': thumbnail,
+        }
index d7956e6e4a20d733947f70a2933e06c2f0d144de..479430c51072ab91e976df4d459af372c5608cdd 100644 (file)
@@ -1,19 +1,23 @@
 # coding: utf-8
 from __future__ import unicode_literals
 
+import hashlib
+import time
 from .common import InfoExtractor
-from ..utils import ExtractorError
+from ..utils import (ExtractorError, unescapeHTML)
+from ..compat import (compat_str, compat_basestring)
 
 
 class DouyuTVIE(InfoExtractor):
     _VALID_URL = r'http://(?:www\.)?douyutv\.com/(?P<id>[A-Za-z0-9]+)'
-    _TEST = {
+    _TESTS = [{
         'url': 'http://www.douyutv.com/iseven',
         'info_dict': {
-            'id': 'iseven',
+            'id': '17732',
+            'display_id': 'iseven',
             'ext': 'flv',
             'title': 're:^清晨醒脑!T-ara根本停不下来! [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
-            'description': 'md5:9e525642c25a0a24302869937cf69d17',
+            'description': 'md5:c93d6692dde6fe33809a46edcbecca44',
             'thumbnail': 're:^https?://.*\.jpg$',
             'uploader': '7师傅',
             'uploader_id': '431925',
@@ -22,22 +26,52 @@ class DouyuTVIE(InfoExtractor):
         'params': {
             'skip_download': True,
         }
-    }
+    }, {
+        'url': 'http://www.douyutv.com/85982',
+        'info_dict': {
+            'id': '85982',
+            'display_id': '85982',
+            'ext': 'flv',
+            'title': 're:^小漠从零单排记!——CSOL2躲猫猫 [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
+            'description': 'md5:746a2f7a253966a06755a912f0acc0d2',
+            'thumbnail': 're:^https?://.*\.jpg$',
+            'uploader': 'douyu小漠',
+            'uploader_id': '3769985',
+            'is_live': True,
+        },
+        'params': {
+            'skip_download': True,
+        }
+    }]
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
 
+        if video_id.isdigit():
+            room_id = video_id
+        else:
+            page = self._download_webpage(url, video_id)
+            room_id = self._html_search_regex(
+                r'"room_id"\s*:\s*(\d+),', page, 'room id')
+
+        prefix = 'room/%s?aid=android&client_sys=android&time=%d' % (
+            room_id, int(time.time()))
+
+        auth = hashlib.md5((prefix + '1231').encode('ascii')).hexdigest()
         config = self._download_json(
-            'http://www.douyutv.com/api/client/room/%s' % video_id, video_id)
+            'http://www.douyutv.com/api/v1/%s&auth=%s' % (prefix, auth),
+            video_id)
 
         data = config['data']
 
         error_code = config.get('error', 0)
-        show_status = data.get('show_status')
         if error_code is not 0:
-            raise ExtractorError(
-                'Server reported error %i' % error_code, expected=True)
+            error_desc = 'Server reported error %i' % error_code
+            if isinstance(data, (compat_str, compat_basestring)):
+                error_desc += ': ' + data
+            raise ExtractorError(error_desc, expected=True)
 
+        show_status = data.get('show_status')
         # 1 = live, 2 = offline
         if show_status == '2':
             raise ExtractorError(
@@ -46,7 +80,7 @@ class DouyuTVIE(InfoExtractor):
         base_url = data['rtmp_url']
         live_path = data['rtmp_live']
 
-        title = self._live_title(data['room_name'])
+        title = self._live_title(unescapeHTML(data['room_name']))
         description = data.get('show_details')
         thumbnail = data.get('room_src')
 
@@ -66,7 +100,8 @@ class DouyuTVIE(InfoExtractor):
         self._sort_formats(formats)
 
         return {
-            'id': video_id,
+            'id': room_id,
+            'display_id': video_id,
             'title': title,
             'description': description,
             'thumbnail': thumbnail,
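The reworked Douyu extractor resolves a vanity URL to the numeric room_id and then signs the API call the way the Android client does: an MD5 digest over the query prefix concatenated with the constant salt '1231'. The token computation in isolation (mirroring the code above):

    import hashlib
    import time

    room_id = '17732'
    prefix = 'room/%s?aid=android&client_sys=android&time=%d' % (
        room_id, int(time.time()))
    auth = hashlib.md5((prefix + '1231').encode('ascii')).hexdigest()
    api_url = 'http://www.douyutv.com/api/v1/%s&auth=%s' % (prefix, auth)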
diff --git a/youtube_dl/extractor/dumpert.py b/youtube_dl/extractor/dumpert.py
new file mode 100644 (file)
index 0000000..e43bc81
--- /dev/null
@@ -0,0 +1,56 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import base64
+
+from .common import InfoExtractor
+from ..utils import qualities
+
+
+class DumpertIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?dumpert\.nl/mediabase/(?P<id>[0-9]+/[0-9a-zA-Z]+)'
+    _TEST = {
+        'url': 'http://www.dumpert.nl/mediabase/6646981/951bc60f/',
+        'md5': '1b9318d7d5054e7dcb9dc7654f21d643',
+        'info_dict': {
+            'id': '6646981/951bc60f',
+            'ext': 'mp4',
+            'title': 'Ik heb nieuws voor je',
+            'description': 'Niet schrikken hoor',
+            'thumbnail': 're:^https?://.*\.jpg$',
+        }
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+
+        files_base64 = self._search_regex(
+            r'data-files="([^"]+)"', webpage, 'data files')
+
+        files = self._parse_json(
+            base64.b64decode(files_base64.encode('utf-8')).decode('utf-8'),
+            video_id)
+
+        quality = qualities(['flv', 'mobile', 'tablet', '720p'])
+
+        formats = [{
+            'url': video_url,
+            'format_id': format_id,
+            'quality': quality(format_id),
+        } for format_id, video_url in files.items() if format_id != 'still']
+        self._sort_formats(formats)
+
+        title = self._html_search_meta(
+            'title', webpage) or self._og_search_title(webpage)
+        description = self._html_search_meta(
+            'description', webpage) or self._og_search_description(webpage)
+        thumbnail = files.get('still') or self._og_search_thumbnail(webpage)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'description': description,
+            'thumbnail': thumbnail,
+            'formats': formats
+        }
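Dumpert embeds its format table as a base64-encoded JSON blob in the data-files attribute; every entry except the 'still' thumbnail becomes a downloadable format. Decoding such a blob by hand looks like this (made-up payload for illustration):

    import base64
    import json

    blob = base64.b64encode(json.dumps({
        'mobile': 'http://example.com/v.mp4',
        'still': 'http://example.com/t.jpg',
    }).encode('utf-8'))
    files = json.loads(base64.b64decode(blob).decode('utf-8'))
    formats = [{'url': url, 'format_id': fmt}
               for fmt, url in files.items() if fmt != 'still']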
index 79e2fbd394681283e07a7146bc51f39a7499324d..0cbca90b061cf2358600146f37f6da5b61d71709 100644 (file)
@@ -1,11 +1,17 @@
 from __future__ import unicode_literals
 
+import re
+
 from .common import InfoExtractor
+from ..compat import compat_urllib_parse
+from ..utils import ExtractorError
 
 
 class EroProfileIE(InfoExtractor):
     _VALID_URL = r'https?://(?:www\.)?eroprofile\.com/m/videos/view/(?P<id>[^/]+)'
-    _TEST = {
+    _LOGIN_URL = 'http://www.eroprofile.com/auth/auth.php?'
+    _NETRC_MACHINE = 'eroprofile'
+    _TESTS = [{
         'url': 'http://www.eroprofile.com/m/videos/view/sexy-babe-softcore',
         'md5': 'c26f351332edf23e1ea28ce9ec9de32f',
         'info_dict': {
@@ -16,13 +22,55 @@ class EroProfileIE(InfoExtractor):
             'thumbnail': 're:https?://.*\.jpg',
             'age_limit': 18,
         }
-    }
+    }, {
+        'url': 'http://www.eroprofile.com/m/videos/view/Try-It-On-Pee_cut_2-wmv-4shared-com-file-sharing-download-movie-file',
+        'md5': '1baa9602ede46ce904c431f5418d8916',
+        'info_dict': {
+            'id': '1133519',
+            'ext': 'm4v',
+            'title': 'Try It On Pee_cut_2.wmv - 4shared.com - file sharing - download movie file',
+            'thumbnail': 're:https?://.*\.jpg',
+            'age_limit': 18,
+        },
+        'skip': 'Requires login',
+    }]
+
+    def _login(self):
+        (username, password) = self._get_login_info()
+        if username is None:
+            return
+
+        query = compat_urllib_parse.urlencode({
+            'username': username,
+            'password': password,
+            'url': 'http://www.eroprofile.com/',
+        })
+        login_url = self._LOGIN_URL + query
+        login_page = self._download_webpage(login_url, None, False)
+
+        m = re.search(r'Your username or password was incorrect\.', login_page)
+        if m:
+            raise ExtractorError(
+                'Wrong username and/or password.', expected=True)
+
+        self.report_login()
+        redirect_url = self._search_regex(
+            r'<script[^>]+?src="([^"]+)"', login_page, 'login redirect url')
+        self._download_webpage(redirect_url, None, False)
+
+    def _real_initialize(self):
+        self._login()
 
     def _real_extract(self, url):
         display_id = self._match_id(url)
 
         webpage = self._download_webpage(url, display_id)
 
+        m = re.search(r'You must be logged in to view this video\.', webpage)
+        if m:
+            raise ExtractorError(
+                'This video requires login. Please specify a username and password and try again.', expected=True)
+
         video_id = self._search_regex(
             [r"glbUpdViews\s*\('\d*','(\d+)'", r'p/report/video/(\d+)'],
             webpage, 'video id', default=None)
index 8a49b0b54a86329bd821817dead195c768000246..2ff002643c9e4404b3427f4f309c187186ccc37a 100644 (file)
@@ -29,6 +29,7 @@ from ..utils import (
     xpath_text,
 )
 from .brightcove import BrightcoveIE
+from .nbc import NBCSportsVPlayerIE
 from .ooyala import OoyalaIE
 from .rutv import RUTVIE
 from .smotri import SmotriIE
@@ -620,6 +621,16 @@ class GenericIE(InfoExtractor):
                 'age_limit': 0,
             },
         },
+        # 5min embed
+        {
+            'url': 'http://techcrunch.com/video/facebook-creates-on-this-day-crunch-report/518726732/',
+            'md5': '4c6f127a30736b59b3e2c19234ee2bf7',
+            'info_dict': {
+                'id': '518726732',
+                'ext': 'mp4',
+                'title': 'Facebook Creates "On This Day" | Crunch Report',
+            },
+        },
         # RSS feed with enclosure
         {
             'url': 'http://podcastfeeds.nbcnews.com/audio/podcast/MSNBC-MADDOW-NETCAST-M4V.xml',
@@ -629,6 +640,16 @@ class GenericIE(InfoExtractor):
                 'upload_date': '20150228',
                 'title': 'pdv_maddow_netcast_m4v-02-27-2015-201624',
             }
+        },
+        # NBC Sports vplayer embed
+        {
+            'url': 'http://www.riderfans.com/forum/showthread.php?121827-Freeman&s=e98fa1ea6dc08e886b1678d35212494a',
+            'info_dict': {
+                'id': 'ln7x1qSThw4k',
+                'ext': 'flv',
+                'title': "PFT Live: New leader in the 'new-look' defense",
+                'description': 'md5:65a19b4bbfb3b0c0c5768bed1dfad74e',
+            },
         }
     ]
 
@@ -1236,6 +1257,17 @@ class GenericIE(InfoExtractor):
         if mobj is not None:
             return self.url_result(mobj.group('url'), 'Pladform')
 
+        # Look for 5min embeds
+        mobj = re.search(
+            r'<meta[^>]+property="og:video"[^>]+content="https?://embed\.5min\.com/(?P<id>[0-9]+)/?', webpage)
+        if mobj is not None:
+            return self.url_result('5min:%s' % mobj.group('id'), 'FiveMin')
+
+        # Look for NBC Sports VPlayer embeds
+        nbc_sports_url = NBCSportsVPlayerIE._extract_url(webpage)
+        if nbc_sports_url:
+            return self.url_result(nbc_sports_url, 'NBCSportsVPlayer')
+
         def check_video(vurl):
             if YoutubeIE.suitable(vurl):
                 return True
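Both new detections follow the generic extractor's usual delegation pattern: locate the embed in the page, then hand the canonical URL to the dedicated extractor via url_result. Keeping the matching in a static _extract_url on the owning extractor, as NBCSportsVPlayerIE does, lets GenericIE and YahooIE share one regex instead of duplicating it. In outline (simplified stand-in, not the full class):

    import re

    class NBCSportsVPlayerIE(object):
        @staticmethod
        def _extract_url(webpage):
            m = re.search(
                r'<iframe[^>]+src="(?P<url>https?://vplayer\.nbcsports\.com/[^"]+)"',
                webpage)
            return m.group('url') if m else None

    page = '<iframe src="https://vplayer.nbcsports.com/p/BxmELC/x/select/abc"></iframe>'
    url = NBCSportsVPlayerIE._extract_url(page)
    # GenericIE then returns self.url_result(url, 'NBCSportsVPlayer')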
index 3645d3033f74ae174e3eaa85ad55bbe677d9daba..ecd0ac8b1b501d9ad97261f57a5b0fee1cd68ce7 100644 (file)
@@ -14,7 +14,7 @@ from ..utils import (
 
 
 class NBCIE(InfoExtractor):
-    _VALID_URL = r'http://www\.nbc\.com/(?:[^/]+/)+(?P<id>n?\d+)'
+    _VALID_URL = r'https?://www\.nbc\.com/(?:[^/]+/)+(?P<id>n?\d+)'
 
     _TESTS = [
         {
@@ -50,6 +50,57 @@ class NBCIE(InfoExtractor):
         return self.url_result(theplatform_url)
 
 
+class NBCSportsVPlayerIE(InfoExtractor):
+    _VALID_URL = r'https?://vplayer\.nbcsports\.com/(?:[^/]+/)+(?P<id>[0-9a-zA-Z_]+)'
+
+    _TESTS = [{
+        'url': 'https://vplayer.nbcsports.com/p/BxmELC/nbcsports_share/select/9CsDKds0kvHI',
+        'info_dict': {
+            'id': '9CsDKds0kvHI',
+            'ext': 'flv',
+            'description': 'md5:df390f70a9ba7c95ff1daace988f0d8d',
+            'title': 'Tyler Kalinoski hits buzzer-beater to lift Davidson',
+        }
+    }, {
+        'url': 'http://vplayer.nbcsports.com/p/BxmELC/nbc_embedshare/select/_hqLjQ95yx8Z',
+        'only_matching': True,
+    }]
+
+    @staticmethod
+    def _extract_url(webpage):
+        iframe_m = re.search(
+            r'<iframe[^>]+src="(?P<url>https?://vplayer\.nbcsports\.com/[^"]+)"', webpage)
+        if iframe_m:
+            return iframe_m.group('url')
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+        theplatform_url = self._og_search_video_url(webpage)
+        return self.url_result(theplatform_url, 'ThePlatform')
+
+
+class NBCSportsIE(InfoExtractor):
+    # Does not include https because its certificate is invalid
+    _VALID_URL = r'http://www\.nbcsports\.com//?(?:[^/]+/)+(?P<id>[0-9a-z-]+)'
+
+    _TEST = {
+        'url': 'http://www.nbcsports.com//college-basketball/ncaab/tom-izzo-michigan-st-has-so-much-respect-duke',
+        'info_dict': {
+            'id': 'PHJSaFWbrTY9',
+            'ext': 'flv',
+            'title': 'Tom Izzo, Michigan St. has \'so much respect\' for Duke',
+            'description': 'md5:ecb459c9d59e0766ac9c7d5d0eda8113',
+        }
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+        return self.url_result(
+            NBCSportsVPlayerIE._extract_url(webpage), 'NBCSportsVPlayer')
+
+
 class NBCNewsIE(InfoExtractor):
     _VALID_URL = r'''(?x)https?://(?:www\.)?nbcnews\.com/
         (?:video/.+?/(?P<id>\d+)|
index a20672c0cc7fea5309e77bb193b887ced2b8d7d5..46cebc0d7b05080491d5f1d32ee8a709b549debc 100644 (file)
@@ -5,19 +5,33 @@ from .zdf import extract_from_xml_url
 
 
 class PhoenixIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?phoenix\.de/content/(?P<id>[0-9]+)'
-    _TEST = {
-        'url': 'http://www.phoenix.de/content/884301',
-        'md5': 'ed249f045256150c92e72dbb70eadec6',
-        'info_dict': {
-            'id': '884301',
-            'ext': 'mp4',
-            'title': 'Michael Krons mit Hans-Werner Sinn',
-            'description': 'Im Dialog - Sa. 25.10.14, 00.00 - 00.35 Uhr',
-            'upload_date': '20141025',
-            'uploader': 'Im Dialog',
-        }
-    }
+    _VALID_URL = r'''(?x)https?://(?:www\.)?phoenix\.de/content/
+        (?:
+            phoenix/die_sendungen/(?:[^/]+/)?
+        )?
+        (?P<id>[0-9]+)'''
+    _TESTS = [
+        {
+            'url': 'http://www.phoenix.de/content/884301',
+            'md5': 'ed249f045256150c92e72dbb70eadec6',
+            'info_dict': {
+                'id': '884301',
+                'ext': 'mp4',
+                'title': 'Michael Krons mit Hans-Werner Sinn',
+                'description': 'Im Dialog - Sa. 25.10.14, 00.00 - 00.35 Uhr',
+                'upload_date': '20141025',
+                'uploader': 'Im Dialog',
+            }
+        },
+        {
+            'url': 'http://www.phoenix.de/content/phoenix/die_sendungen/869815',
+            'only_matching': True,
+        },
+        {
+            'url': 'http://www.phoenix.de/content/phoenix/die_sendungen/diskussionen/928234',
+            'only_matching': True,
+        },
+    ]
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
index 3a27e37890dc78b26af866c9884807c97c56ccb9..0c8b731cf47267568e43ccd09ff21f1683b4d992 100644 (file)
@@ -33,10 +33,8 @@ class PornHubIE(InfoExtractor):
     }
 
     def _extract_count(self, pattern, webpage, name):
-        count = self._html_search_regex(pattern, webpage, '%s count' % name, fatal=False)
-        if count:
-            count = str_to_int(count)
-        return count
+        return str_to_int(self._search_regex(
+            pattern, webpage, '%s count' % name, fatal=False))
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
@@ -62,11 +60,14 @@ class PornHubIE(InfoExtractor):
         if thumbnail:
             thumbnail = compat_urllib_parse.unquote(thumbnail)
 
-        view_count = self._extract_count(r'<span class="count">([\d,\.]+)</span> views', webpage, 'view')
-        like_count = self._extract_count(r'<span class="votesUp">([\d,\.]+)</span>', webpage, 'like')
-        dislike_count = self._extract_count(r'<span class="votesDown">([\d,\.]+)</span>', webpage, 'dislike')
+        view_count = self._extract_count(
+            r'<span class="count">([\d,\.]+)</span> views', webpage, 'view')
+        like_count = self._extract_count(
+            r'<span class="votesUp">([\d,\.]+)</span>', webpage, 'like')
+        dislike_count = self._extract_count(
+            r'<span class="votesDown">([\d,\.]+)</span>', webpage, 'dislike')
         comment_count = self._extract_count(
-            r'All comments \(<var class="videoCommentCount">([\d,\.]+)</var>', webpage, 'comment')
+            r'All Comments\s*<span>\(([\d,.]+)\)', webpage, 'comment')
 
         video_urls = list(map(compat_urllib_parse.unquote, re.findall(r'"quality_[0-9]{3}p":"([^"]+)', webpage)))
         if webpage.find('"encrypted":true') != -1:
diff --git a/youtube_dl/extractor/safari.py b/youtube_dl/extractor/safari.py
new file mode 100644 (file)
index 0000000..10251f2
--- /dev/null
@@ -0,0 +1,157 @@
+# encoding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from .brightcove import BrightcoveIE
+
+from ..compat import (
+    compat_urllib_parse,
+    compat_urllib_request,
+)
+from ..utils import (
+    ExtractorError,
+    smuggle_url,
+    std_headers,
+)
+
+
+class SafariBaseIE(InfoExtractor):
+    _LOGIN_URL = 'https://www.safaribooksonline.com/accounts/login/'
+    _SUCCESSFUL_LOGIN_REGEX = r'<a href="/accounts/logout/"[^>]*>Sign Out</a>'
+    _ACCOUNT_CREDENTIALS_HINT = 'Use --username and --password options to supply credentials for safaribooksonline.com'
+    _NETRC_MACHINE = 'safari'
+
+    _API_BASE = 'https://www.safaribooksonline.com/api/v1/book'
+    _API_FORMAT = 'json'
+
+    LOGGED_IN = False
+
+    def _real_initialize(self):
+        # We only need to log in once for courses or individual videos
+        if not self.LOGGED_IN:
+            self._login()
+            SafariBaseIE.LOGGED_IN = True
+
+    def _login(self):
+        (username, password) = self._get_login_info()
+        if username is None:
+            raise ExtractorError(
+                self._ACCOUNT_CREDENTIALS_HINT,
+                expected=True)
+
+        headers = std_headers
+        if 'Referer' not in headers:
+            headers['Referer'] = self._LOGIN_URL
+
+        login_page = self._download_webpage(
+            self._LOGIN_URL, None,
+            'Downloading login form')
+
+        csrf = self._html_search_regex(
+            r"name='csrfmiddlewaretoken'\s+value='([^']+)'",
+            login_page, 'csrf token')
+
+        login_form = {
+            'csrfmiddlewaretoken': csrf,
+            'email': username,
+            'password1': password,
+            'login': 'Sign In',
+            'next': '',
+        }
+
+        request = compat_urllib_request.Request(
+            self._LOGIN_URL, compat_urllib_parse.urlencode(login_form), headers=headers)
+        login_page = self._download_webpage(
+            request, None, 'Logging in as %s' % username)
+
+        if re.search(self._SUCCESSFUL_LOGIN_REGEX, login_page) is None:
+            raise ExtractorError(
+                'Login failed; make sure your credentials are correct and try again.',
+                expected=True)
+
+        self.to_screen('Login successful')
+
+
+class SafariIE(SafariBaseIE):
+    IE_NAME = 'safari'
+    IE_DESC = 'safaribooksonline.com online video'
+    _VALID_URL = r'''(?x)https?://
+                            (?:www\.)?safaribooksonline\.com/
+                                (?:
+                                    library/view/[^/]+|
+                                    api/v1/book
+                                )/
+                                (?P<course_id>\d+)/
+                                    (?:chapter(?:-content)?/)?
+                                (?P<part>part\d+)\.html
+    '''
+
+    _TESTS = [{
+        'url': 'https://www.safaribooksonline.com/library/view/hadoop-fundamentals-livelessons/9780133392838/part00.html',
+        'md5': '5b0c4cc1b3c1ba15dda7344085aa5592',
+        'info_dict': {
+            'id': '2842601850001',
+            'ext': 'mp4',
+            'title': 'Introduction',
+        },
+        'skip': 'Requires safaribooksonline account credentials',
+    }, {
+        'url': 'https://www.safaribooksonline.com/api/v1/book/9780133392838/chapter/part00.html',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        course_id = mobj.group('course_id')
+        part = mobj.group('part')
+
+        webpage = self._download_webpage(
+            '%s/%s/chapter-content/%s.html' % (self._API_BASE, course_id, part),
+            part)
+
+        bc_url = BrightcoveIE._extract_brightcove_url(webpage)
+        if not bc_url:
+            raise ExtractorError('Could not extract Brightcove URL from %s' % url, expected=True)
+
+        return self.url_result(smuggle_url(bc_url, {'Referer': url}), 'Brightcove')
+
+
+class SafariCourseIE(SafariBaseIE):
+    IE_NAME = 'safari:course'
+    IE_DESC = 'safaribooksonline.com online courses'
+
+    _VALID_URL = r'https?://(?:www\.)?safaribooksonline\.com/(?:library/view/[^/]+|api/v1/book)/(?P<id>\d+)/?(?:[#?]|$)'
+
+    _TESTS = [{
+        'url': 'https://www.safaribooksonline.com/library/view/hadoop-fundamentals-livelessons/9780133392838/',
+        'info_dict': {
+            'id': '9780133392838',
+            'title': 'Hadoop Fundamentals LiveLessons',
+        },
+        'playlist_count': 22,
+        'skip': 'Requires safaribooksonline account credentials',
+    }, {
+        'url': 'https://www.safaribooksonline.com/api/v1/book/9781449396459/?override_format=json',
+        'only_matching': True,
+    }]
+
+    def _real_extract(self, url):
+        course_id = self._match_id(url)
+
+        course_json = self._download_json(
+            '%s/%s/?override_format=%s' % (self._API_BASE, course_id, self._API_FORMAT),
+            course_id, 'Downloading course JSON')
+
+        if 'chapters' not in course_json:
+            raise ExtractorError(
+                'No chapters found for course %s' % course_id, expected=True)
+
+        entries = [
+            self.url_result(chapter, 'Safari')
+            for chapter in course_json['chapters']]
+
+        course_title = course_json['title']
+
+        return self.playlist_result(entries, course_id, course_title)
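SafariBaseIE logs in once per process: LOGGED_IN is read through the instance but written on the class (SafariBaseIE.LOGGED_IN = True), so SafariIE, SafariCourseIE and every chapter entry a course spawns all see the flag; assigning self.LOGGED_IN = True would merely shadow it on a single instance. The pattern in miniature:

    class Base(object):
        LOGGED_IN = False

        def init(self):
            if not self.LOGGED_IN:        # falls back to the class attribute
                # ... perform the expensive login here ...
                Base.LOGGED_IN = True     # visible to all subclass instances

    a, b = Base(), Base()
    a.init()
    assert b.LOGGED_IN                    # the second instance skips login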
index 9f79ff5c1b66d2bf37369a6009a914043493b407..0b717a1e42b8dd2c3d8a88d602f001876cf99e03 100644 (file)
@@ -30,7 +30,7 @@ class SlideshareIE(InfoExtractor):
         page_title = mobj.group('title')
         webpage = self._download_webpage(url, page_title)
         slideshare_obj = self._search_regex(
-            r'var\s+slideshare_object\s*=\s*({.*?});\s*var\s+user_info\s*=',
+            r'\$\.extend\(slideshare_object,\s*(\{.*?\})\);',
             webpage, 'slideshare object')
         info = json.loads(slideshare_obj)
         if info['slideshow']['type'] != 'video':
index 9d45059723c3450a0774275ad708eed88b6e0fcb..316b2c90f110770299084889552b8137e072a617 100644 (file)
@@ -242,7 +242,7 @@ class SoundcloudIE(InfoExtractor):
 
 
 class SoundcloudSetIE(SoundcloudIE):
-    _VALID_URL = r'https?://(?:www\.)?soundcloud\.com/(?P<uploader>[\w\d-]+)/sets/(?P<slug_title>[\w\d-]+)(?:/(?P<token>[^?/]+))?'
+    _VALID_URL = r'https?://(?:(?:www|m)\.)?soundcloud\.com/(?P<uploader>[\w\d-]+)/sets/(?P<slug_title>[\w\d-]+)(?:/(?P<token>[^?/]+))?'
     IE_NAME = 'soundcloud:set'
     _TESTS = [{
         'url': 'https://soundcloud.com/the-concept-band/sets/the-royal-concept-ep',
@@ -287,7 +287,7 @@ class SoundcloudSetIE(SoundcloudIE):
 
 
 class SoundcloudUserIE(SoundcloudIE):
-    _VALID_URL = r'https?://(www\.)?soundcloud\.com/(?P<user>[^/]+)/?((?P<rsrc>tracks|likes)/?)?(\?.*)?$'
+    _VALID_URL = r'https?://(?:(?:www|m)\.)?soundcloud\.com/(?P<user>[^/]+)/?((?P<rsrc>tracks|likes)/?)?(\?.*)?$'
     IE_NAME = 'soundcloud:user'
     _TESTS = [{
         'url': 'https://soundcloud.com/the-concept-band',
index 7cb06f351e5b388142b00b51aeba69a1ecfef250..a46a7ecba299c2e4e4e30eb69e7a8cfb3214155d 100644 (file)
@@ -54,7 +54,7 @@ class TeamcocoIE(InfoExtractor):
             embed_url, video_id, 'Downloading embed page')
 
         player_data = self._parse_json(self._search_regex(
-            r'Y\.Ginger\.Module\.Player\((\{.*?\})\);', embed, 'player data'), video_id)
+            r'Y\.Ginger\.Module\.Player(?:;var\s*player\s*=\s*new\s*m)?\((\{.*?\})\);', embed, 'player data'), video_id)
         data = self._parse_json(
             base64.b64decode(player_data['preload'].encode('ascii')).decode('utf-8'), video_id)
 
index feac666f78baff49f4fb312a147acad67d320bc2..0e3e627f495aa5051cb96fe05f180f9790761129 100644 (file)
@@ -92,7 +92,7 @@ class ThePlatformIE(InfoExtractor):
             error_msg = next(
                 n.attrib['abstract']
                 for n in meta.findall(_x('.//smil:ref'))
-                if n.attrib.get('title') == 'Geographic Restriction')
+                if n.attrib.get('title') == 'Geographic Restriction' or n.attrib.get('title') == 'Expired')
         except StopIteration:
             pass
         else:
index 06554a1befefb938796f106b5ac45c75d942dc14..96c809eaf7155290210e0f8b18d3a2c7c948ba97 100644 (file)
@@ -42,7 +42,6 @@ class UltimediaIE(InfoExtractor):
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
-
         webpage = self._download_webpage(url, video_id)
 
         deliver_url = self._search_regex(
@@ -81,8 +80,8 @@ class UltimediaIE(InfoExtractor):
         title = clean_html((
             self._html_search_regex(
                 r'(?s)<div\s+id="catArticle">.+?</div>(.+?)</h1>',
-                webpage, 'title', default=None)
-            or self._search_regex(
+                webpage, 'title', default=None) or
+            self._search_regex(
                 r"var\s+nameVideo\s*=\s*'([^']+)'",
                 deliver_page, 'title')))
 
diff --git a/youtube_dl/extractor/vessel.py b/youtube_dl/extractor/vessel.py
new file mode 100644 (file)
index 0000000..6215f06
--- /dev/null
@@ -0,0 +1,127 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import json
+
+from .common import InfoExtractor
+from ..compat import compat_urllib_request
+from ..utils import (
+    ExtractorError,
+    parse_iso8601,
+)
+
+
+class VesselIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?vessel\.com/videos/(?P<id>[0-9a-zA-Z]+)'
+    _API_URL_TEMPLATE = 'https://www.vessel.com/api/view/items/%s'
+    _LOGIN_URL = 'https://www.vessel.com/api/account/login'
+    _NETRC_MACHINE = 'vessel'
+    _TEST = {
+        'url': 'https://www.vessel.com/videos/HDN7G5UMs',
+        'md5': '455cdf8beb71c6dd797fd2f3818d05c4',
+        'info_dict': {
+            'id': 'HDN7G5UMs',
+            'ext': 'mp4',
+            'title': 'Nvidia GeForce GTX Titan X - The Best Video Card on the Market?',
+            'thumbnail': 're:^https?://.*\.jpg$',
+            'upload_date': '20150317',
+            'description': 'Did Nvidia pull out all the stops on the Titan X, or does its performance leave something to be desired?',
+            'timestamp': int,
+        },
+    }
+
+    @staticmethod
+    def make_json_request(url, data):
+        payload = json.dumps(data).encode('utf-8')
+        req = compat_urllib_request.Request(url, payload)
+        req.add_header('Content-Type', 'application/json; charset=utf-8')
+        return req
+
+    @staticmethod
+    def find_assets(data, asset_type):
+        for asset in data.get('assets', []):
+            if asset.get('type') == asset_type:
+                yield asset
+
+    def _check_access_rights(self, data):
+        access_info = data.get('__view', {})
+        if not access_info.get('allow_access', True):
+            err_code = access_info.get('error_code') or ''
+            if err_code == 'ITEM_PAID_ONLY':
+                raise ExtractorError(
+                    'This video requires subscription.', expected=True)
+            else:
+                raise ExtractorError(
+                    'Access to this content is restricted. (%s said: %s)' % (self.IE_NAME, err_code), expected=True)
+
+    def _login(self):
+        (username, password) = self._get_login_info()
+        if username is None:
+            return
+        self.report_login()
+        data = {
+            'client_id': 'web',
+            'type': 'password',
+            'user_key': username,
+            'password': password,
+        }
+        login_request = VesselIE.make_json_request(self._LOGIN_URL, data)
+        self._download_webpage(login_request, None, False, 'Wrong login info')
+
+    def _real_initialize(self):
+        self._login()
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+        data = self._parse_json(self._search_regex(
+            r'App\.bootstrapData\((.*?)\);', webpage, 'data'), video_id)
+        asset_id = data['model']['data']['id']
+
+        req = VesselIE.make_json_request(
+            self._API_URL_TEMPLATE % asset_id, {'client': 'web'})
+        data = self._download_json(req, video_id)
+
+        self._check_access_rights(data)
+
+        try:
+            video_asset = next(VesselIE.find_assets(data, 'video'))
+        except StopIteration:
+            raise ExtractorError('No video assets found')
+
+        formats = []
+        for f in video_asset.get('sources', []):
+            if f['name'] == 'hls-index':
+                formats.extend(self._extract_m3u8_formats(
+                    f['location'], video_id, ext='mp4', m3u8_id='m3u8'))
+            else:
+                formats.append({
+                    'format_id': f['name'],
+                    'tbr': f.get('bitrate'),
+                    'height': f.get('height'),
+                    'width': f.get('width'),
+                    'url': f['location'],
+                })
+        self._sort_formats(formats)
+
+        thumbnails = []
+        for im_asset in VesselIE.find_assets(data, 'image'):
+            thumbnails.append({
+                'url': im_asset['location'],
+                'width': im_asset.get('width', 0),
+                'height': im_asset.get('height', 0),
+            })
+
+        return {
+            'id': video_id,
+            'title': data['title'],
+            'formats': formats,
+            'thumbnails': thumbnails,
+            'description': data.get('short_description'),
+            'duration': data.get('duration'),
+            'comment_count': data.get('comment_count'),
+            'like_count': data.get('like_count'),
+            'view_count': data.get('view_count'),
+            'timestamp': parse_iso8601(data.get('released_at')),
+        }
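Vessel's API expects JSON bodies, so make_json_request serializes the payload and sets the Content-Type header in one place; the login call and the per-item view call both reuse it. Usage reduces to (the item id here is only an example):

    req = VesselIE.make_json_request(
        'https://www.vessel.com/api/view/items/HDN7G5UMs',
        {'client': 'web'})
    # req carries the JSON-encoded body plus
    # 'Content-Type: application/json; charset=utf-8'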
index bd09652cd96340155cc084f6814df4fbecd6f707..28bcc89cd7423dafa40032076d1bd3ad12f4bdcf 100644 (file)
@@ -244,6 +244,16 @@ class VimeoIE(VimeoBaseInfoExtractor):
         # and latter we extract those that are Vimeo specific.
         self.report_extraction(video_id)
 
+        vimeo_config = self._search_regex(
+            r'vimeo\.config\s*=\s*({.+?});', webpage,
+            'vimeo config', default=None)
+        if vimeo_config:
+            seed_status = self._parse_json(vimeo_config, video_id).get('seed_status', {})
+            if seed_status.get('state') == 'failed':
+                raise ExtractorError(
+                    '%s returned error: %s' % (self.IE_NAME, seed_status['title']),
+                    expected=True)
+
         # Extract the config JSON
         try:
             try:
index 4971965f9d090cce61a2e8b6d1486fadc873b4dc..81d885fdcee1cf788c217e862629df58f386d73c 100644 (file)
@@ -69,18 +69,26 @@ class XuiteIE(InfoExtractor):
         'only_matching': True,
     }]
 
+    @staticmethod
+    def base64_decode_utf8(data):
+        return base64.b64decode(data.encode('utf-8')).decode('utf-8')
+
+    @staticmethod
+    def base64_encode_utf8(data):
+        return base64.b64encode(data.encode('utf-8')).decode('utf-8')
+
     def _extract_flv_config(self, media_id):
-        base64_media_id = base64.b64encode(media_id.encode('utf-8')).decode('utf-8')
+        base64_media_id = self.base64_encode_utf8(media_id)
         flv_config = self._download_xml(
             'http://vlog.xuite.net/flash/player?media=%s' % base64_media_id,
             'flv config')
         prop_dict = {}
         for prop in flv_config.findall('./property'):
-            prop_id = base64.b64decode(prop.attrib['id']).decode('utf-8')
+            prop_id = self.base64_decode_utf8(prop.attrib['id'])
             # CDATA may be empty in flv config
             if not prop.text:
                 continue
-            encoded_content = base64.b64decode(prop.text).decode('utf-8')
+            encoded_content = self.base64_decode_utf8(prop.text)
             prop_dict[prop_id] = compat_urllib_parse_unquote(encoded_content)
         return prop_dict
 
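The two new Xuite helpers simply pin down the text-to-base64 round trip so the str/bytes handling is identical on Python 2 and 3:

    media_id = '123|456'
    encoded = XuiteIE.base64_encode_utf8(media_id)    # text -> base64 text
    assert XuiteIE.base64_decode_utf8(encoded) == media_id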
index 97dbac4cce53d7fe956b074fddbe40993fd5681f..b777159c5639304edf1433857f626c29299e4bcb 100644 (file)
@@ -17,6 +17,8 @@ from ..utils import (
     int_or_none,
 )
 
+from .nbc import NBCSportsVPlayerIE
+
 
 class YahooIE(InfoExtractor):
     IE_DESC = 'Yahoo screen and movies'
@@ -129,6 +131,15 @@ class YahooIE(InfoExtractor):
         }, {
             'url': 'https://gma.yahoo.com/pizza-delivery-man-surprised-huge-tip-college-kids-195200785.html',
             'only_matching': True,
+        }, {
+            'note': 'NBC Sports embeds',
+            'url': 'http://sports.yahoo.com/blogs/ncaab-the-dagger/tyler-kalinoski-s-buzzer-beater-caps-davidson-s-comeback-win-185609842.html?guid=nbc_cbk_davidsonbuzzerbeater_150313',
+            'info_dict': {
+                'id': '9CsDKds0kvHI',
+                'ext': 'flv',
+                'description': 'md5:df390f70a9ba7c95ff1daace988f0d8d',
+                'title': 'Tyler Kalinoski hits buzzer-beater to lift Davidson',
+            }
         }
     ]
 
@@ -151,6 +162,10 @@ class YahooIE(InfoExtractor):
                 items = json.loads(items_json)
                 video_id = items[0]['id']
                 return self._get_info(video_id, display_id, webpage)
+        # Look for NBCSports iframes
+        nbc_sports_url = NBCSportsVPlayerIE._extract_url(webpage)
+        if nbc_sports_url:
+            return self.url_result(nbc_sports_url, 'NBCSportsVPlayer')
 
         items_json = self._search_regex(
             r'mediaItems: ({.*?})$', webpage, 'items', flags=re.MULTILINE,
index e4c855ee0e3616981c77d41b441f15adf6ce0453..6abe72f739b63d8b39d8cdfc5bfccf70dc545715 100644 (file)
@@ -52,7 +52,7 @@ class YouPornIE(InfoExtractor):
             webpage, 'JSON parameters')
         try:
             params = json.loads(json_params)
-        except:
+        except ValueError:
             raise ExtractorError('Invalid JSON')
 
         self.report_extraction(video_id)
index 27c8c4453773974f52d02ecba58f092221c6e118..5488101e1997d1078c44a73ef0af5487aa3dcea9 100644 (file)
@@ -1263,27 +1263,7 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
 
         return self.playlist_result(url_results, playlist_id, title)
 
-    def _real_extract(self, url):
-        # Extract playlist id
-        mobj = re.match(self._VALID_URL, url)
-        if mobj is None:
-            raise ExtractorError('Invalid URL: %s' % url)
-        playlist_id = mobj.group(1) or mobj.group(2)
-
-        # Check if it's a video-specific URL
-        query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
-        if 'v' in query_dict:
-            video_id = query_dict['v'][0]
-            if self._downloader.params.get('noplaylist'):
-                self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
-                return self.url_result(video_id, 'Youtube', video_id=video_id)
-            else:
-                self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))
-
-        if playlist_id.startswith('RD') or playlist_id.startswith('UL'):
-            # Mixes require a custom extraction process
-            return self._extract_mix(playlist_id)
-
+    def _extract_playlist(self, playlist_id):
         url = self._TEMPLATE_URL % playlist_id
         page = self._download_webpage(url, playlist_id)
         more_widget_html = content_html = page
@@ -1327,6 +1307,29 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
         url_results = self._ids_to_results(ids)
         return self.playlist_result(url_results, playlist_id, playlist_title)
 
+    def _real_extract(self, url):
+        # Extract playlist id
+        mobj = re.match(self._VALID_URL, url)
+        if mobj is None:
+            raise ExtractorError('Invalid URL: %s' % url)
+        playlist_id = mobj.group(1) or mobj.group(2)
+
+        # Check if it's a video-specific URL
+        query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
+        if 'v' in query_dict:
+            video_id = query_dict['v'][0]
+            if self._downloader.params.get('noplaylist'):
+                self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
+                return self.url_result(video_id, 'Youtube', video_id=video_id)
+            else:
+                self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))
+
+        if playlist_id.startswith('RD') or playlist_id.startswith('UL'):
+            # Mixes require a custom extraction process
+            return self._extract_mix(playlist_id)
+
+        return self._extract_playlist(playlist_id)
+
 
 class YoutubeChannelIE(InfoExtractor):
     IE_DESC = 'YouTube.com channels'
@@ -1643,21 +1646,26 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
 
 
 class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
+    IE_NAME = 'youtube:recommended'
     IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
     _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
     _FEED_NAME = 'recommended'
     _PLAYLIST_TITLE = 'Youtube Recommended videos'
 
 
-class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
+class YoutubeWatchLaterIE(YoutubePlaylistIE):
+    IE_NAME = 'youtube:watchlater'
     IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)'
-    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
-    _FEED_NAME = 'watch_later'
-    _PLAYLIST_TITLE = 'Youtube Watch Later'
-    _PERSONAL_FEED = True
+    _VALID_URL = r'https?://www\.youtube\.com/(?:feed/watch_later|playlist\?list=WL)|:ytwatchlater'
+
+    _TESTS = []  # override PlaylistIE tests
+
+    def _real_extract(self, url):
+        return self._extract_playlist('WL')
 
 
 class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
+    IE_NAME = 'youtube:history'
     IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)'
     _VALID_URL = 'https?://www\.youtube\.com/feed/history|:ythistory'
     _FEED_NAME = 'history'
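Extracting _extract_playlist out of _real_extract is what allows YoutubeWatchLaterIE to shrink to a thin YoutubePlaylistIE subclass: the watch-later feed is just the predefined playlist 'WL', so it can skip URL parsing and mix handling entirely. The shape of the refactor (sketch, details elided):

    class YoutubePlaylistIE(object):
        def _extract_playlist(self, playlist_id):
            pass  # shared page-fetching and pagination loop

        def _real_extract(self, url):
            playlist_id = 'PL...'  # parsed from url; mixes and --no-playlist handled first
            return self._extract_playlist(playlist_id)

    class YoutubeWatchLaterIE(YoutubePlaylistIE):
        def _real_extract(self, url):
            return self._extract_playlist('WL')  # fixed, well-known id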
index b6f51cfd5e1ed5cebb4981dfbf3152e67ed33d1d..55adf9685b3e1407d6cb1fe1e1205e290203fa95 100644 (file)
@@ -3,7 +3,6 @@ from __future__ import unicode_literals
 import io
 import os
 import subprocess
-import sys
 import time
 
 
@@ -269,19 +268,17 @@ class FFmpegExtractAudioPP(FFmpegPostProcessor):
             else:
                 self._downloader.to_screen('[' + self.basename + '] Destination: ' + new_path)
                 self.run_ffmpeg(path, new_path, acodec, more_opts)
-        except:
-            etype, e, tb = sys.exc_info()
-            if isinstance(e, AudioConversionError):
-                msg = 'audio conversion failed: ' + e.msg
-            else:
-                msg = 'error running ' + self.basename
-            raise PostProcessingError(msg)
+        except AudioConversionError as e:
+            raise PostProcessingError(
+                'audio conversion failed: ' + e.msg)
+        except Exception:
+            raise PostProcessingError('error running ' + self.basename)
 
         # Try to update the date time for extracted audio file.
         if information.get('filetime') is not None:
             try:
                 os.utime(encodeFilename(new_path), (time.time(), information['filetime']))
-            except:
+            except Exception:
                 self._downloader.report_warning('Cannot update utime of audio file')
 
         information['filepath'] = new_path
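The rewritten handler drops the sys.exc_info() inspection in favor of two typed clauses, which is also why the sys import above becomes unused. Clause order matters: AudioConversionError must precede the catch-all, or it would never be selected. Condensed (names as in the module; the try body is a stand-in):

    try:
        run_ffmpeg()                      # the actual conversion call
    except AudioConversionError as e:     # most specific failure first
        raise PostProcessingError('audio conversion failed: ' + e.msg)
    except Exception:                     # anything else ffmpeg throws
        raise PostProcessingError('error running ffmpeg')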
index d8be4049f5dce0fdd9a61f2aff3c4284d494e598..de3169eef1d6ec29d82a60b2f4b6a68f49d7dd4e 100644 (file)
@@ -65,7 +65,7 @@ def update_self(to_screen, verbose):
     # Check if there is a new version
     try:
         newversion = opener.open(VERSION_URL).read().decode('utf-8').strip()
-    except:
+    except Exception:
         if verbose:
             to_screen(compat_str(traceback.format_exc()))
         to_screen('ERROR: can\'t find the current version. Please try again later.')
@@ -78,7 +78,7 @@ def update_self(to_screen, verbose):
     try:
         versions_info = opener.open(JSON_URL).read().decode('utf-8')
         versions_info = json.loads(versions_info)
-    except:
+    except Exception:
         if verbose:
             to_screen(compat_str(traceback.format_exc()))
         to_screen('ERROR: can\'t obtain versions info. Please try again later.')
index 472d4df41fda2cb1ffd0392cc4da0f4bdcc2a48a..90e0ed9ab7eb36d4c9f30efffdd2868e9cfe6d47 100644 (file)
@@ -75,7 +75,7 @@ def preferredencoding():
     try:
         pref = locale.getpreferredencoding()
         'TEST'.encode(pref)
-    except:
+    except Exception:
         pref = 'UTF-8'
 
     return pref
@@ -127,7 +127,7 @@ def write_json_file(obj, fn):
             except OSError:
                 pass
         os.rename(tf.name, fn)
-    except:
+    except Exception:
         try:
             os.remove(tf.name)
         except OSError:
@@ -348,7 +348,7 @@ def _htmlentity_transform(entity):
     if entity in compat_html_entities.name2codepoint:
         return compat_chr(compat_html_entities.name2codepoint[entity])
 
-    mobj = re.match(r'#(x?[0-9]+)', entity)
+    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
     if mobj is not None:
         numstr = mobj.group(1)
         if numstr.startswith('x'):
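The widened regex is what makes the new unescapeHTML tests pass: the old '#(x?[0-9]+)' could match '&#47;' but never the hex digits of '&#x2F;', while '#(x[0-9a-fA-F]+|[0-9]+)' accepts both forms and the existing branch selects the base. Condensed from _htmlentity_transform:

    import re

    def entity_to_char(entity):
        mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
        numstr = mobj.group(1)
        if numstr.startswith('x'):
            return chr(int(numstr[1:], 16))
        return chr(int(numstr, 10))

    assert entity_to_char('#x2F') == '/'
    assert entity_to_char('#47') == '/'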
index 039ceadf2ee938604b7832fb919915f520eb63da..dd93e295ac574da20839c9f0ac584bb90a101b00 100644 (file)
@@ -1,3 +1,3 @@
 from __future__ import unicode_literals
 
-__version__ = '2015.03.24'
+__version__ = '2015.03.28'