Merge branch 'master' of https://github.com/zx8/youtube-dl into zx8-master
authorSergey M․ <dstftw@gmail.com>
Thu, 26 Mar 2015 17:57:13 +0000 (23:57 +0600)
committerSergey M․ <dstftw@gmail.com>
Thu, 26 Mar 2015 17:57:13 +0000 (23:57 +0600)
16 files changed:
AUTHORS
docs/supportedsites.md
test/test_execution.py
test/test_utils.py
youtube_dl/extractor/__init__.py
youtube_dl/extractor/mlb.py
youtube_dl/extractor/npo.py
youtube_dl/extractor/redtube.py
youtube_dl/extractor/slideshare.py
youtube_dl/extractor/teamcoco.py
youtube_dl/extractor/twentytwotracks.py [new file with mode: 0644]
youtube_dl/extractor/vessel.py [new file with mode: 0644]
youtube_dl/extractor/youtube.py
youtube_dl/options.py
youtube_dl/utils.py
youtube_dl/version.py

diff --git a/AUTHORS b/AUTHORS
index 512469f4c9c9e50ce945372ca1dd5e704efc177b..59f1b5f21345b95622d4c21af7354f355dc782f8 100644 (file)
--- a/AUTHORS
+++ b/AUTHORS
@@ -117,3 +117,4 @@ Alexander Mamay
 Devin J. Pohly
 Eduardo Ferro Aldama
 Jeff Buchbinder
+Amish Bhadeshia
index 72b365305a1b4ec40eb7d0043750d39a16ad66d7..baf7b3880fd6180f19cdf44f04a9d38d2e0d4062 100644 (file)
  - **Letv**
  - **LetvPlaylist**
  - **LetvTv**
+ - **Libsyn**
  - **lifenews**: LIFE | NEWS
  - **LiveLeak**
  - **livestream**
  - **npo.nl:radio**
  - **npo.nl:radio:fragment**
  - **NRK**
+ - **NRKPlaylist**
  - **NRKTV**
  - **ntv.ru**
  - **Nuvid**
index 60df187de4921dfa7df808302f55f1ccd66bcb13..f31e51558abe7990a520c93b9e834753e402ca59 100644 (file)
@@ -1,4 +1,6 @@
 #!/usr/bin/env python
+# coding: utf-8
+
 from __future__ import unicode_literals
 
 import unittest
@@ -27,5 +29,12 @@ class TestExecution(unittest.TestCase):
     def test_main_exec(self):
         subprocess.check_call([sys.executable, 'youtube_dl/__main__.py', '--version'], cwd=rootDir, stdout=_DEV_NULL)
 
+    def test_cmdline_umlauts(self):
+        p = subprocess.Popen(
+            [sys.executable, 'youtube_dl/__main__.py', 'ä', '--version'],
+            cwd=rootDir, stdout=_DEV_NULL, stderr=subprocess.PIPE)
+        _, stderr = p.communicate()
+        self.assertFalse(stderr)
+
 if __name__ == '__main__':
     unittest.main()
index a8ab876850d3645eb615d552162128befdac316d..abaf1ab7331eab044c684b592bd5bfb85f4d39e0 100644 (file)
@@ -200,6 +200,8 @@ class TestUtil(unittest.TestCase):
 
     def test_unescape_html(self):
         self.assertEqual(unescapeHTML('%20;'), '%20;')
+        self.assertEqual(unescapeHTML('&#x2F;'), '/')
+        self.assertEqual(unescapeHTML('&#47;'), '/')
         self.assertEqual(
             unescapeHTML('&eacute;'), 'é')
 
index 3a0c42deda3f5e97412dd75833a0fb427c769078..d56eb64488fa5c1fc6c118709e4a6a797c78660d 100644 (file)
@@ -530,6 +530,10 @@ from .tvp import TvpIE, TvpSeriesIE
 from .tvplay import TVPlayIE
 from .tweakers import TweakersIE
 from .twentyfourvideo import TwentyFourVideoIE
+from .twentytwotracks import (
+    TwentyTwoTracksIE,
+    TwentyTwoTracksGenreIE
+)
 from .twitch import (
     TwitchVideoIE,
     TwitchChapterIE,
@@ -551,6 +555,7 @@ from .ustream import UstreamIE, UstreamChannelIE
 from .vbox7 import Vbox7IE
 from .veehd import VeeHDIE
 from .veoh import VeohIE
+from .vessel import VesselIE
 from .vesti import VestiIE
 from .vevo import VevoIE
 from .vgtv import VGTVIE
index 1a241aca77983ac9626a53e59bf85ad4394cc8fd..e369551c2fb5730377d863cf3bc2bdde31eb5f60 100644 (file)
@@ -10,7 +10,7 @@ from ..utils import (
 
 
 class MLBIE(InfoExtractor):
-    _VALID_URL = r'https?://m(?:lb)?\.mlb\.com/(?:(?:.*?/)?video/(?:topic/[\da-z_-]+/)?v|(?:shared/video/embed/embed\.html|[^/]+/video/play\.jsp)\?.*?\bcontent_id=)(?P<id>n?\d+)'
+    _VALID_URL = r'https?://m(?:lb)?\.(?:[\da-z_-]+\.)?mlb\.com/(?:(?:.*?/)?video/(?:topic/[\da-z_-]+/)?v|(?:shared/video/embed/embed\.html|[^/]+/video/play\.jsp)\?.*?\bcontent_id=)(?P<id>n?\d+)'
     _TESTS = [
         {
             'url': 'http://m.mlb.com/sea/video/topic/51231442/v34698933/nymsea-ackley-robs-a-home-run-with-an-amazing-catch/?c_id=sea',
@@ -80,6 +80,10 @@ class MLBIE(InfoExtractor):
             'url': 'http://mlb.mlb.com/es/video/play.jsp?content_id=36599553',
             'only_matching': True,
         },
+        {
+            'url': 'http://m.cardinals.mlb.com/stl/video/v51175783/atlstl-piscotty-makes-great-sliding-catch-on-line/?partnerId=as_mlb_20150321_42500876&adbid=579409712979910656&adbpl=tw&adbpr=52847728',
+            'only_matching': True,
+        }
     ]
 
     def _real_extract(self, url):
index 557dffa46846ff8a4c94f6bb102a186fa1ce5eb8..5d84485714b9f360d47c8676710e9c3e6d9578c7 100644 (file)
@@ -231,7 +231,10 @@ class NPOLiveIE(NPOBaseIE):
                 stream_url = self._download_json(
                     stream_info['stream'], display_id,
                     'Downloading %s URL' % stream_type,
-                    transform_source=strip_jsonp)
+                    'Unable to download %s URL' % stream_type,
+                    transform_source=strip_jsonp, fatal=False)
+                if not stream_url:
+                    continue
                 if stream_type == 'hds':
                     f4m_formats = self._extract_f4m_formats(stream_url, display_id)
                     # f4m downloader downloads only piece of live stream
index 846b76c81528431c0faf8ea3fc9bbd6b017db099..d6054d7175fd49a22117dd357bea7905f6e739be 100644 (file)
@@ -1,17 +1,19 @@
 from __future__ import unicode_literals
 
 from .common import InfoExtractor
+from ..utils import ExtractorError
 
 
 class RedTubeIE(InfoExtractor):
     _VALID_URL = r'http://(?:www\.)?redtube\.com/(?P<id>[0-9]+)'
     _TEST = {
         'url': 'http://www.redtube.com/66418',
+        'md5': '7b8c22b5e7098a3e1c09709df1126d2d',
         'info_dict': {
             'id': '66418',
             'ext': 'mp4',
-            "title": "Sucked on a toilet",
-            "age_limit": 18,
+            'title': 'Sucked on a toilet',
+            'age_limit': 18,
         }
     }
 
@@ -19,6 +21,9 @@ class RedTubeIE(InfoExtractor):
         video_id = self._match_id(url)
         webpage = self._download_webpage(url, video_id)
 
+        if any(s in webpage for s in ['video-deleted-info', '>This video has been removed']):
+            raise ExtractorError('Video %s has been removed' % video_id, expected=True)
+
         video_url = self._html_search_regex(
             r'<source src="(.+?)" type="video/mp4">', webpage, 'video URL')
         video_title = self._html_search_regex(
index 9f79ff5c1b66d2bf37369a6009a914043493b407..0b717a1e42b8dd2c3d8a88d602f001876cf99e03 100644 (file)
@@ -30,7 +30,7 @@ class SlideshareIE(InfoExtractor):
         page_title = mobj.group('title')
         webpage = self._download_webpage(url, page_title)
         slideshare_obj = self._search_regex(
-            r'var\s+slideshare_object\s*=\s*({.*?});\s*var\s+user_info\s*=',
+            r'\$\.extend\(slideshare_object,\s*(\{.*?\})\);',
             webpage, 'slideshare object')
         info = json.loads(slideshare_obj)
         if info['slideshow']['type'] != 'video':
index 7cb06f351e5b388142b00b51aeba69a1ecfef250..a46a7ecba299c2e4e4e30eb69e7a8cfb3214155d 100644 (file)
@@ -54,7 +54,7 @@ class TeamcocoIE(InfoExtractor):
             embed_url, video_id, 'Downloading embed page')
 
         player_data = self._parse_json(self._search_regex(
-            r'Y\.Ginger\.Module\.Player\((\{.*?\})\);', embed, 'player data'), video_id)
+            r'Y\.Ginger\.Module\.Player(?:;var\s*player\s*=\s*new\s*m)?\((\{.*?\})\);', embed, 'player data'), video_id)
         data = self._parse_json(
             base64.b64decode(player_data['preload'].encode('ascii')).decode('utf-8'), video_id)
 
diff --git a/youtube_dl/extractor/twentytwotracks.py b/youtube_dl/extractor/twentytwotracks.py
new file mode 100644 (file)
index 0000000..d6c0ab1
--- /dev/null
@@ -0,0 +1,86 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import int_or_none
+
+# 22Tracks regularly replaces the audio tracks that can be streamed on their
+# site. The tracks usually expire after 1 month, so we can't add tests.
+
+
+class TwentyTwoTracksIE(InfoExtractor):
+    _VALID_URL = r'https?://22tracks\.com/(?P<city>[a-z]+)/(?P<genre>[\da-z]+)/(?P<id>\d+)'
+    IE_NAME = '22tracks:track'
+
+    _API_BASE = 'http://22tracks.com/api'
+
+    def _extract_info(self, city, genre_name, track_id=None):
+        item_id = track_id if track_id else genre_name
+
+        cities = self._download_json(
+            '%s/cities' % self._API_BASE, item_id,
+            'Downloading cities info',
+            'Unable to download cities info')
+        city_id = [x['id'] for x in cities if x['slug'] == city][0]
+
+        genres = self._download_json(
+            '%s/genres/%s' % (self._API_BASE, city_id), item_id,
+            'Downloading %s genres info' % city,
+            'Unable to download %s genres info' % city)
+        genre = [x for x in genres if x['slug'] == genre_name][0]
+        genre_id = genre['id']
+
+        tracks = self._download_json(
+            '%s/tracks/%s' % (self._API_BASE, genre_id), item_id,
+            'Downloading %s genre tracks info' % genre_name,
+            'Unable to download track info')
+
+        return [x for x in tracks if x['id'] == item_id][0] if track_id else [genre['title'], tracks]
+
+    def _get_track_url(self, filename, track_id):
+        token = self._download_json(
+            'http://22tracks.com/token.php?desktop=true&u=/128/%s' % filename,
+            track_id, 'Downloading token', 'Unable to download token')
+        return 'http://audio.22tracks.com%s?st=%s&e=%d' % (token['filename'], token['st'], token['e'])
+
+    def _extract_track_info(self, track_info, track_id):
+        download_url = self._get_track_url(track_info['filename'], track_id)
+        title = '%s - %s' % (track_info['artist'].strip(), track_info['title'].strip())
+        return {
+            'id': track_id,
+            'url': download_url,
+            'ext': 'mp3',
+            'title': title,
+            'duration': int_or_none(track_info.get('duration')),
+            'timestamp': int_or_none(track_info.get('published_at') or track_info.get('created'))
+        }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+
+        city = mobj.group('city')
+        genre = mobj.group('genre')
+        track_id = mobj.group('id')
+
+        track_info = self._extract_info(city, genre, track_id)
+        return self._extract_track_info(track_info, track_id)
+
+
+class TwentyTwoTracksGenreIE(TwentyTwoTracksIE):
+    _VALID_URL = r'https?://22tracks\.com/(?P<city>[a-z]+)/(?P<genre>[\da-z]+)/?$'
+    IE_NAME = '22tracks:genre'
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+
+        city = mobj.group('city')
+        genre = mobj.group('genre')
+
+        genre_title, tracks = self._extract_info(city, genre)
+
+        entries = [
+            self._extract_track_info(track_info, track_info['id'])
+            for track_info in tracks]
+
+        return self.playlist_result(entries, genre, genre_title)
diff --git a/youtube_dl/extractor/vessel.py b/youtube_dl/extractor/vessel.py
new file mode 100644 (file)
index 0000000..123d947
--- /dev/null
@@ -0,0 +1,127 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import json
+
+from .common import InfoExtractor
+from ..compat import compat_urllib_request
+from ..utils import (
+    ExtractorError,
+    parse_iso8601,
+)
+
+
+class VesselIE(InfoExtractor):
+    _VALID_URL = r'https?://(?:www\.)?vessel\.com/videos/(?P<id>[0-9a-zA-Z]+)'
+    _API_URL_TEMPLATE = 'https://www.vessel.com/api/view/items/%s'
+    _LOGIN_URL = 'https://www.vessel.com/api/account/login'
+    _NETRC_MACHINE = 'vessel'
+    _TEST = {
+        'url': 'https://www.vessel.com/videos/HDN7G5UMs',
+        'md5': '455cdf8beb71c6dd797fd2f3818d05c4',
+        'info_dict': {
+            'id': 'HDN7G5UMs',
+            'ext': 'mp4',
+            'title': 'Nvidia GeForce GTX Titan X - The Best Video Card on the Market?',
+            'thumbnail': 're:^https?://.*\.jpg$',
+            'upload_date': '20150317',
+            'description': 'Did Nvidia pull out all the stops on the Titan X, or does its performance leave something to be desired?',
+            'timestamp': int,
+        },
+    }
+
+    @staticmethod
+    def make_json_request(url, data):
+        payload = json.dumps(data).encode('utf-8')
+        req = compat_urllib_request.Request(url, payload)
+        req.add_header('Content-Type', 'application/json; charset=utf-8')
+        return req
+
+    @staticmethod
+    def find_assets(data, asset_type):
+        for asset in data.get('assets', []):
+            if asset.get('type') == asset_type:
+                yield asset
+
+    def _check_access_rights(self, data):
+        access_info = data.get('__view', {})
+        if access_info.get('allow_access') == False:
+            err_code = access_info.get('error_code') or ''
+            if err_code == 'ITEM_PAID_ONLY':
+                raise ExtractorError(
+                    'This video requires subscription.', expected=True)
+            else:
+                raise ExtractorError(
+                    'Access to this content is restricted. (%s said: %s)' % (self.IE_NAME, err_code), expected=True)
+
+    def _login(self):
+        (username, password) = self._get_login_info()
+        if username is None:
+            return
+        self.report_login()
+        data = {
+            'client_id': 'web',
+            'type': 'password',
+            'user_key': username,
+            'password': password,
+        }
+        login_request = VesselIE.make_json_request(self._LOGIN_URL, data)
+        self._download_webpage(login_request, None, False, 'Wrong login info')
+
+    def _real_initialize(self):
+        self._login()
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+        data = self._parse_json(self._search_regex(
+            r'App\.bootstrapData\((.*?)\);', webpage, 'data'), video_id)
+        asset_id = data['model']['data']['id']
+
+        req = VesselIE.make_json_request(
+            self._API_URL_TEMPLATE % asset_id, {'client': 'web'})
+        data = self._download_json(req, video_id)
+
+        self._check_access_rights(data)
+
+        try:
+            video_asset = next(VesselIE.find_assets(data, 'video'))
+        except StopIteration:
+            raise ExtractorError('No video assets found')
+
+        formats = []
+        for f in video_asset.get('sources', []):
+            if f['name'] == 'hls-index':
+                formats.extend(self._extract_m3u8_formats(
+                    f['location'], video_id, ext='mp4', m3u8_id='m3u8'))
+            else:
+                formats.append({
+                    'format_id': f['name'],
+                    'tbr': f.get('bitrate'),
+                    'height': f.get('height'),
+                    'width': f.get('width'),
+                    'url': f['location'],
+                })
+        self._sort_formats(formats)
+
+        thumbnails = []
+        for im_asset in VesselIE.find_assets(data, 'image'):
+            thumbnails.append({
+                'url': im_asset['location'],
+                'width': im_asset.get('width', 0),
+                'height': im_asset.get('height', 0),
+            })
+
+        return {
+            'id': video_id,
+            'title': data['title'],
+            'formats': formats,
+            'thumbnails': thumbnails,
+            'description': data.get('short_description'),
+            'duration': data.get('duration'),
+            'comment_count': data.get('comment_count'),
+            'like_count': data.get('like_count'),
+            'view_count': data.get('view_count'),
+            'timestamp': parse_iso8601(data.get('released_at')),
+        }
index 27c8c4453773974f52d02ecba58f092221c6e118..eba699c3ace0ebdb379a457e2a6ceaff8f1f2f41 100644 (file)
@@ -1263,27 +1263,7 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
 
         return self.playlist_result(url_results, playlist_id, title)
 
-    def _real_extract(self, url):
-        # Extract playlist id
-        mobj = re.match(self._VALID_URL, url)
-        if mobj is None:
-            raise ExtractorError('Invalid URL: %s' % url)
-        playlist_id = mobj.group(1) or mobj.group(2)
-
-        # Check if it's a video-specific URL
-        query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
-        if 'v' in query_dict:
-            video_id = query_dict['v'][0]
-            if self._downloader.params.get('noplaylist'):
-                self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
-                return self.url_result(video_id, 'Youtube', video_id=video_id)
-            else:
-                self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))
-
-        if playlist_id.startswith('RD') or playlist_id.startswith('UL'):
-            # Mixes require a custom extraction process
-            return self._extract_mix(playlist_id)
-
+    def _extract_playlist(self, playlist_id):
         url = self._TEMPLATE_URL % playlist_id
         page = self._download_webpage(url, playlist_id)
         more_widget_html = content_html = page
@@ -1327,6 +1307,29 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
         url_results = self._ids_to_results(ids)
         return self.playlist_result(url_results, playlist_id, playlist_title)
 
+    def _real_extract(self, url):
+        # Extract playlist id
+        mobj = re.match(self._VALID_URL, url)
+        if mobj is None:
+            raise ExtractorError('Invalid URL: %s' % url)
+        playlist_id = mobj.group(1) or mobj.group(2)
+
+        # Check if it's a video-specific URL
+        query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
+        if 'v' in query_dict:
+            video_id = query_dict['v'][0]
+            if self._downloader.params.get('noplaylist'):
+                self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
+                return self.url_result(video_id, 'Youtube', video_id=video_id)
+            else:
+                self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))
+
+        if playlist_id.startswith('RD') or playlist_id.startswith('UL'):
+            # Mixes require a custom extraction process
+            return self._extract_mix(playlist_id)
+
+        return self._extract_playlist(playlist_id)
+
 
 class YoutubeChannelIE(InfoExtractor):
     IE_DESC = 'YouTube.com channels'
@@ -1643,21 +1646,27 @@ class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
 
 
 class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
+    IE_NAME = 'youtube:recommended'
     IE_DESC = 'YouTube.com recommended videos, ":ytrec" for short (requires authentication)'
     _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
     _FEED_NAME = 'recommended'
     _PLAYLIST_TITLE = 'Youtube Recommended videos'
 
 
-class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
+class YoutubeWatchLaterIE(YoutubePlaylistIE):
+    IE_NAME = 'youtube:watchlater'
     IE_DESC = 'Youtube watch later list, ":ytwatchlater" for short (requires authentication)'
-    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
+    _VALID_URL = r'https?://www\.youtube\.com/(?:feed/watch_later|playlist\?list=WL)|:ytwatchlater'
     _FEED_NAME = 'watch_later'
     _PLAYLIST_TITLE = 'Youtube Watch Later'
     _PERSONAL_FEED = True
 
+    def _real_extract(self, url):
+        return self._extract_playlist('WL')
+
 
 class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
+    IE_NAME = 'youtube:history'
     IE_DESC = 'Youtube watch history, ":ythistory" for short (requires authentication)'
     _VALID_URL = 'https?://www\.youtube\.com/feed/history|:ythistory'
     _FEED_NAME = 'history'
index 4e6e47d6fdc430f4071c1470f25a1b7377ad7845..35c7e5fb3507f5e846e166c5f6c15f6cf637498a 100644 (file)
@@ -794,6 +794,11 @@ def parseOpts(overrideArguments=None):
             write_string('[debug] Override config: ' + repr(overrideArguments) + '\n')
     else:
         command_line_conf = sys.argv[1:]
+        # Workaround for Python 2.x, where sys.argv is a list of byte strings
+        if sys.version_info < (3,):
+            command_line_conf = [
+                a.decode('utf-8', 'replace') for a in command_line_conf]
+
         if '--ignore-config' in command_line_conf:
             system_conf = []
             user_conf = []
index 472d4df41fda2cb1ffd0392cc4da0f4bdcc2a48a..245d623d86df365a562fb395bc792b42f68634eb 100644 (file)
@@ -348,7 +348,7 @@ def _htmlentity_transform(entity):
     if entity in compat_html_entities.name2codepoint:
         return compat_chr(compat_html_entities.name2codepoint[entity])
 
-    mobj = re.match(r'#(x?[0-9]+)', entity)
+    mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity)
     if mobj is not None:
         numstr = mobj.group(1)
         if numstr.startswith('x'):
index 51b4260aad38f9dbae3b8d2e0d42602c540d3e4f..039ceadf2ee938604b7832fb919915f520eb63da 100644 (file)
@@ -1,3 +1,3 @@
 from __future__ import unicode_literals
 
-__version__ = '2015.03.18'
+__version__ = '2015.03.24'