Merge pull request #7208 from yan12125/letv-fix
authorYen Chi Hsuan <yan12125@gmail.com>
Sun, 18 Oct 2015 14:32:10 +0000 (22:32 +0800)
committerYen Chi Hsuan <yan12125@gmail.com>
Sun, 18 Oct 2015 14:32:10 +0000 (22:32 +0800)
[Letv] Fix extraction

18 files changed:
test/test_InfoExtractor.py
test/test_youtube_lists.py
youtube_dl/extractor/__init__.py
youtube_dl/extractor/canalc2.py
youtube_dl/extractor/common.py
youtube_dl/extractor/crunchyroll.py
youtube_dl/extractor/dailymotion.py
youtube_dl/extractor/eagleplatform.py
youtube_dl/extractor/imdb.py
youtube_dl/extractor/lynda.py
youtube_dl/extractor/rte.py
youtube_dl/extractor/twitch.py
youtube_dl/extractor/twitter.py
youtube_dl/extractor/vidme.py
youtube_dl/extractor/viewster.py
youtube_dl/extractor/vimeo.py
youtube_dl/extractor/vine.py
youtube_dl/extractor/youtube.py

index 2a00d09a5a1f6e666d4e4fdd4c837495045a24b4..938466a800122211ab0414d9aa9de831951e2903 100644 (file)
@@ -37,12 +37,16 @@ class TestInfoExtractor(unittest.TestCase):
             <meta property='og:image' content='http://domain.com/pic.jpg?key1=val1&amp;key2=val2'/>
             <meta content='application/x-shockwave-flash' property='og:video:type'>
             <meta content='Foo' property=og:foobar>
+            <meta name="og:test1" content='foo > < bar'/>
+            <meta name="og:test2" content="foo >//< bar"/>
             '''
         self.assertEqual(ie._og_search_title(html), 'Foo')
         self.assertEqual(ie._og_search_description(html), 'Some video\'s description ')
         self.assertEqual(ie._og_search_thumbnail(html), 'http://domain.com/pic.jpg?key1=val1&key2=val2')
         self.assertEqual(ie._og_search_video_url(html, default=None), None)
         self.assertEqual(ie._og_search_property('foobar', html), 'Foo')
+        self.assertEqual(ie._og_search_property('test1', html), 'foo > < bar')
+        self.assertEqual(ie._og_search_property('test2', html), 'foo >//< bar')
 
     def test_html_search_meta(self):
         ie = self.ie
index c889b6f15c40f5ea91a4dd4ea5a86bb8c62c830d..26aadb34f00e6961c2521ceaa8be1a51fe359a8d 100644 (file)
@@ -57,5 +57,14 @@ class TestYoutubeLists(unittest.TestCase):
         entries = result['entries']
         self.assertEqual(len(entries), 100)
 
+    def test_youtube_flat_playlist_titles(self):
+        dl = FakeYDL()
+        dl.params['extract_flat'] = True
+        ie = YoutubePlaylistIE(dl)
+        result = ie.extract('https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re')
+        self.assertIsPlaylist(result)
+        for entry in result['entries']:
+            self.assertTrue(entry.get('title'))
+
 if __name__ == '__main__':
     unittest.main()
index 462717b1e0aa6363d52dd5660ae6b4fc63eca559..bd6eb6ae04de7536b02193d64364daae3beafcf2 100644 (file)
@@ -690,7 +690,7 @@ from .twitch import (
     TwitchBookmarksIE,
     TwitchStreamIE,
 )
-from .twitter import TwitterCardIE
+from .twitter import TwitterCardIE, TwitterIE
 from .ubu import UbuIE
 from .udemy import (
     UdemyIE,
index c4fefefe43b250c13c3a711cf397e1a3caa046a7..f6a1ff381a61af338fd7dec2419aaadfa1081c8b 100644 (file)
@@ -4,38 +4,53 @@ from __future__ import unicode_literals
 import re
 
 from .common import InfoExtractor
+from ..utils import parse_duration
 
 
 class Canalc2IE(InfoExtractor):
     IE_NAME = 'canalc2.tv'
-    _VALID_URL = r'http://.*?\.canalc2\.tv/video\.asp\?.*?idVideo=(?P<id>\d+)'
+    _VALID_URL = r'https?://(?:www\.)?canalc2\.tv/video/(?P<id>\d+)'
 
     _TEST = {
-        'url': 'http://www.canalc2.tv/video.asp?idVideo=12163&voir=oui',
+        'url': 'http://www.canalc2.tv/video/12163',
         'md5': '060158428b650f896c542dfbb3d6487f',
         'info_dict': {
             'id': '12163',
-            'ext': 'mp4',
-            'title': 'Terrasses du Numérique'
+            'ext': 'flv',
+            'title': 'Terrasses du Numérique',
+            'duration': 122,
+        },
+        'params': {
+            'skip_download': True,  # Requires rtmpdump
         }
     }
 
     def _real_extract(self, url):
-        video_id = re.match(self._VALID_URL, url).group('id')
-        # We need to set the voir field for getting the file name
-        url = 'http://www.canalc2.tv/video.asp?idVideo=%s&voir=oui' % video_id
+        video_id = self._match_id(url)
         webpage = self._download_webpage(url, video_id)
-        file_name = self._search_regex(
-            r"so\.addVariable\('file','(.*?)'\);",
-            webpage, 'file name')
-        video_url = 'http://vod-flash.u-strasbg.fr:8080/' + file_name
+        video_url = self._search_regex(
+            r'jwplayer\((["\'])Player\1\)\.setup\({[^}]*file\s*:\s*(["\'])(?P<file>.+?)\2',
+            webpage, 'video_url', group='file')
+        formats = [{'url': video_url}]
+        if video_url.startswith('rtmp://'):
+            rtmp = re.search(r'^(?P<url>rtmp://[^/]+/(?P<app>.+/))(?P<play_path>mp4:.+)$', video_url)
+            formats[0].update({
+                'url': rtmp.group('url'),
+                'ext': 'flv',
+                'app': rtmp.group('app'),
+                'play_path': rtmp.group('play_path'),
+                'page_url': url,
+            })
 
         title = self._html_search_regex(
-            r'class="evenement8">(.*?)</a>', webpage, 'title')
+            r'(?s)class="[^"]*col_description[^"]*">.*?<h3>(.*?)</h3>', webpage, 'title')
+        duration = parse_duration(self._search_regex(
+            r'id=["\']video_duree["\'][^>]*>([^<]+)',
+            webpage, 'duration', fatal=False))
 
         return {
             'id': video_id,
-            'ext': 'mp4',
-            'url': video_url,
             'title': title,
+            'duration': duration,
+            'formats': formats,
         }
index a0c4af92f2aa284801fcb59b459285dce6933336..6169fbbeb4c4ce8731f7226aebcc2e93bdc6dd84 100644 (file)
@@ -172,6 +172,7 @@ class InfoExtractor(object):
     view_count:     How many users have watched the video on the platform.
     like_count:     Number of positive ratings of the video
     dislike_count:  Number of negative ratings of the video
+    repost_count:   Number of reposts of the video
     average_rating: Average rating give by users, the scale used depends on the webpage
     comment_count:  Number of comments on the video
     comments:       A list of comments, each with one or more of the following
@@ -645,7 +646,7 @@ class InfoExtractor(object):
     # Helper functions for extracting OpenGraph info
     @staticmethod
     def _og_regexes(prop):
-        content_re = r'content=(?:"([^>]+?)"|\'([^>]+?)\'|\s*([^\s"\'=<>`]+?))'
+        content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
         property_re = (r'(?:name|property)=(?:\'og:%(prop)s\'|"og:%(prop)s"|\s*og:%(prop)s\b)'
                        % {'prop': re.escape(prop)})
         template = r'<meta[^>]+?%s[^>]+?%s'
index 95952bc292c05548a88876b7b52cf236f8bdc826..cecd0c7843b9b42881637ad0617e5e351eb8f97e 100644 (file)
@@ -32,6 +32,26 @@ from ..aes import (
 
 
 class CrunchyrollBaseIE(InfoExtractor):
+    _NETRC_MACHINE = 'crunchyroll'
+
+    def _login(self):
+        (username, password) = self._get_login_info()
+        if username is None:
+            return
+        self.report_login()
+        login_url = 'https://www.crunchyroll.com/?a=formhandler'
+        data = urlencode_postdata({
+            'formname': 'RpcApiUser_Login',
+            'name': username,
+            'password': password,
+        })
+        login_request = compat_urllib_request.Request(login_url, data)
+        login_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
+        self._download_webpage(login_request, None, False, 'Wrong login info')
+
+    def _real_initialize(self):
+        self._login()
+
     def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None):
         request = (url_or_request if isinstance(url_or_request, compat_urllib_request.Request)
                    else compat_urllib_request.Request(url_or_request))
@@ -46,10 +66,22 @@ class CrunchyrollBaseIE(InfoExtractor):
         return super(CrunchyrollBaseIE, self)._download_webpage(
             request, video_id, note, errnote, fatal, tries, timeout, encoding)
 
+    @staticmethod
+    def _add_skip_wall(url):
+        parsed_url = compat_urlparse.urlparse(url)
+        qs = compat_urlparse.parse_qs(parsed_url.query)
+        # Always force skip_wall to bypass maturity wall, namely 18+ confirmation message:
+        # > This content may be inappropriate for some people.
+        # > Are you sure you want to continue?
+        # since it's not disabled by default in crunchyroll account's settings.
+        # See https://github.com/rg3/youtube-dl/issues/7202.
+        qs['skip_wall'] = ['1']
+        return compat_urlparse.urlunparse(
+            parsed_url._replace(query=compat_urllib_parse.urlencode(qs, True)))
+
 
 class CrunchyrollIE(CrunchyrollBaseIE):
     _VALID_URL = r'https?://(?:(?P<prefix>www|m)\.)?(?P<url>crunchyroll\.(?:com|fr)/(?:media(?:-|/\?id=)|[^/]*/[^/?&]*?)(?P<video_id>[0-9]+))(?:[/?&]|$)'
-    _NETRC_MACHINE = 'crunchyroll'
     _TESTS = [{
         'url': 'http://www.crunchyroll.com/wanna-be-the-strongest-in-the-world/episode-1-an-idol-wrestler-is-born-645513',
         'info_dict': {
@@ -81,10 +113,13 @@ class CrunchyrollIE(CrunchyrollBaseIE):
             # rtmp
             'skip_download': True,
         },
-
     }, {
         'url': 'http://www.crunchyroll.fr/girl-friend-beta/episode-11-goodbye-la-mode-661697',
         'only_matching': True,
+    }, {
+        # geo-restricted (US), 18+ maturity wall, non-premium available
+        'url': 'http://www.crunchyroll.com/cosplay-complex-ova/episode-1-the-birth-of-the-cosplay-club-565617',
+        'only_matching': True,
     }]
 
     _FORMAT_IDS = {
@@ -94,24 +129,6 @@ class CrunchyrollIE(CrunchyrollBaseIE):
         '1080': ('80', '108'),
     }
 
-    def _login(self):
-        (username, password) = self._get_login_info()
-        if username is None:
-            return
-        self.report_login()
-        login_url = 'https://www.crunchyroll.com/?a=formhandler'
-        data = urlencode_postdata({
-            'formname': 'RpcApiUser_Login',
-            'name': username,
-            'password': password,
-        })
-        login_request = compat_urllib_request.Request(login_url, data)
-        login_request.add_header('Content-Type', 'application/x-www-form-urlencoded')
-        self._download_webpage(login_request, None, False, 'Wrong login info')
-
-    def _real_initialize(self):
-        self._login()
-
     def _decrypt_subtitles(self, data, iv, id):
         data = bytes_to_intlist(base64.b64decode(data.encode('utf-8')))
         iv = bytes_to_intlist(base64.b64decode(iv.encode('utf-8')))
@@ -254,7 +271,7 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
         else:
             webpage_url = 'http://www.' + mobj.group('url')
 
-        webpage = self._download_webpage(webpage_url, video_id, 'Downloading webpage')
+        webpage = self._download_webpage(self._add_skip_wall(webpage_url), video_id, 'Downloading webpage')
         note_m = self._html_search_regex(
             r'<div class="showmedia-trailer-notice">(.+?)</div>',
             webpage, 'trailer-notice', default='')
@@ -352,7 +369,7 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
 
 class CrunchyrollShowPlaylistIE(CrunchyrollBaseIE):
     IE_NAME = "crunchyroll:playlist"
-    _VALID_URL = r'https?://(?:(?P<prefix>www|m)\.)?(?P<url>crunchyroll\.com/(?!(?:news|anime-news|library|forum|launchcalendar|lineup|store|comics|freetrial|login))(?P<id>[\w\-]+))/?$'
+    _VALID_URL = r'https?://(?:(?P<prefix>www|m)\.)?(?P<url>crunchyroll\.com/(?!(?:news|anime-news|library|forum|launchcalendar|lineup|store|comics|freetrial|login))(?P<id>[\w\-]+))/?(?:\?|$)'
 
     _TESTS = [{
         'url': 'http://www.crunchyroll.com/a-bridge-to-the-starry-skies-hoshizora-e-kakaru-hashi',
@@ -361,12 +378,25 @@ class CrunchyrollShowPlaylistIE(CrunchyrollBaseIE):
             'title': 'A Bridge to the Starry Skies - Hoshizora e Kakaru Hashi'
         },
         'playlist_count': 13,
+    }, {
+        # geo-restricted (US), 18+ maturity wall, non-premium available
+        'url': 'http://www.crunchyroll.com/cosplay-complex-ova',
+        'info_dict': {
+            'id': 'cosplay-complex-ova',
+            'title': 'Cosplay Complex OVA'
+        },
+        'playlist_count': 3,
+        'skip': 'Georestricted',
+    }, {
+        # geo-restricted (US), 18+ maturity wall, non-premium will be available since 2015.11.14
+        'url': 'http://www.crunchyroll.com/ladies-versus-butlers?skip_wall=1',
+        'only_matching': True,
     }]
 
     def _real_extract(self, url):
         show_id = self._match_id(url)
 
-        webpage = self._download_webpage(url, show_id)
+        webpage = self._download_webpage(self._add_skip_wall(url), show_id)
         title = self._html_search_regex(
             r'(?s)<h1[^>]*>\s*<span itemprop="name">(.*?)</span>',
             webpage, 'title')
index 80a05cfee880f57a73b79275f15729adb14e6d24..9cd9ff17d2559a5648e72b1f425ba0aee5ed42b4 100644 (file)
@@ -96,6 +96,11 @@ class DailymotionIE(DailymotionBaseInfoExtractor):
                 'uploader': 'HotWaves1012',
                 'age_limit': 18,
             }
+        },
+        # geo-restricted, player v5
+        {
+            'url': 'http://www.dailymotion.com/video/xhza0o',
+            'only_matching': True,
         }
     ]
 
@@ -124,6 +129,9 @@ class DailymotionIE(DailymotionBaseInfoExtractor):
         if player_v5:
             player = self._parse_json(player_v5, video_id)
             metadata = player['metadata']
+
+            self._check_error(metadata)
+
             formats = []
             for quality, media_list in metadata['qualities'].items():
                 for media in media_list:
@@ -201,9 +209,7 @@ class DailymotionIE(DailymotionBaseInfoExtractor):
                 'video info', flags=re.MULTILINE),
             video_id)
 
-        if info.get('error') is not None:
-            msg = 'Couldn\'t get video, Dailymotion says: %s' % info['error']['title']
-            raise ExtractorError(msg, expected=True)
+        self._check_error(info)
 
         formats = []
         for (key, format_id) in self._FORMATS:
@@ -246,6 +252,11 @@ class DailymotionIE(DailymotionBaseInfoExtractor):
             'duration': info['duration']
         }
 
+    def _check_error(self, info):
+        if info.get('error') is not None:
+            raise ExtractorError(
+                '%s said: %s' % (self.IE_NAME, info['error']['title']), expected=True)
+
     def _get_subtitles(self, video_id, webpage):
         try:
             sub_list = self._download_webpage(
index e529b9b9600583aadb4bb3e21f7ae61cee0cd4b8..7bbf617d468f9d5295ee2e1123452679a21a7e4b 100644 (file)
@@ -87,7 +87,7 @@ class EaglePlatformIE(InfoExtractor):
         m3u8_url = self._get_video_url(secure_m3u8, video_id, 'Downloading m3u8 JSON')
         formats = self._extract_m3u8_formats(
             m3u8_url, video_id,
-            'mp4', entry_protocol='m3u8_native')
+            'mp4', entry_protocol='m3u8_native', m3u8_id='hls')
 
         mp4_url = self._get_video_url(
             # Secure mp4 URL is constructed according to Player.prototype.mp4 from
index 4bb574cf37df2421721b088ded37a3fc66c8c2ea..02e1e428e9e41ba75a2ff5c37c7cf0732682c111 100644 (file)
@@ -4,8 +4,8 @@ import re
 import json
 
 from .common import InfoExtractor
-from ..compat import (
-    compat_urlparse,
+from ..utils import (
+    qualities,
 )
 
 
@@ -30,24 +30,33 @@ class ImdbIE(InfoExtractor):
         descr = self._html_search_regex(
             r'(?s)<span itemprop="description">(.*?)</span>',
             webpage, 'description', fatal=False)
-        available_formats = re.findall(
-            r'case \'(?P<f_id>.*?)\' :$\s+url = \'(?P<path>.*?)\'', webpage,
-            flags=re.MULTILINE)
+        player_url = 'http://www.imdb.com/video/imdb/vi%s/imdb/single' % video_id
+        player_page = self._download_webpage(
+            player_url, video_id, 'Downloading player page')
+        # the player page contains the info for the default format, we have to
+        # fetch other pages for the rest of the formats
+        extra_formats = re.findall(r'href="(?P<url>%s.*?)".*?>(?P<name>.*?)<' % re.escape(player_url), player_page)
+        format_pages = [
+            self._download_webpage(
+                f_url, video_id, 'Downloading info for %s format' % f_name)
+            for f_url, f_name in extra_formats]
+        format_pages.append(player_page)
+
+        quality = qualities(['SD', '480p', '720p'])
         formats = []
-        for f_id, f_path in available_formats:
-            f_path = f_path.strip()
-            format_page = self._download_webpage(
-                compat_urlparse.urljoin(url, f_path),
-                'Downloading info for %s format' % f_id)
+        for format_page in format_pages:
             json_data = self._search_regex(
                 r'<script[^>]+class="imdb-player-data"[^>]*?>(.*?)</script>',
                 format_page, 'json data', flags=re.DOTALL)
             info = json.loads(json_data)
             format_info = info['videoPlayerObject']['video']
+            f_id = format_info['ffname']
             formats.append({
                 'format_id': f_id,
                 'url': format_info['videoInfoList'][0]['videoUrl'],
+                'quality': quality(f_id),
             })
+        self._sort_formats(formats)
 
         return {
             'id': video_id,
index 378117270439e7ce2669c46422f3df63caa33051..5c973e75ccf630c71b50dcc590e93185f3a5c5ea 100644 (file)
@@ -140,13 +140,14 @@ class LyndaIE(LyndaBaseIE):
 
         prioritized_streams = video_json.get('PrioritizedStreams')
         if prioritized_streams:
-            formats.extend([
-                {
-                    'url': video_url,
-                    'width': int_or_none(format_id),
-                    'format_id': format_id,
-                } for format_id, video_url in prioritized_streams['0'].items()
-            ])
+            for prioritized_stream_id, prioritized_stream in prioritized_streams.items():
+                formats.extend([
+                    {
+                        'url': video_url,
+                        'width': int_or_none(format_id),
+                        'format_id': '%s-%s' % (prioritized_stream_id, format_id),
+                    } for format_id, video_url in prioritized_stream.items()
+                ])
 
         self._check_formats(formats, video_id)
         self._sort_formats(formats)
index 427c70866d1015293c160f1041680174d9f6d13e..d9cfbf1808b712a359222dff60cd27ee1799785f 100644 (file)
@@ -9,16 +9,16 @@ from ..utils import (
 
 
 class RteIE(InfoExtractor):
-    _VALID_URL = r'http?://(?:www\.)?rte\.ie/player/[^/]{2,3}/show/[^/]+/(?P<id>[0-9]+)'
+    _VALID_URL = r'https?://(?:www\.)?rte\.ie/player/[^/]{2,3}/show/[^/]+/(?P<id>[0-9]+)'
     _TEST = {
-        'url': 'http://www.rte.ie/player/de/show/10363114/',
+        'url': 'http://www.rte.ie/player/ie/show/iwitness-862/10478715/',
         'info_dict': {
-            'id': '10363114',
+            'id': '10478715',
             'ext': 'mp4',
-            'title': 'One News',
+            'title': 'Watch iWitness  online',
             'thumbnail': 're:^https?://.*\.jpg$',
-            'description': 'The One O\'Clock News followed by Weather.',
-            'duration': 436.844,
+            'description': 'iWitness : The spirit of Ireland, one voice and one minute at a time.',
+            'duration': 60.046,
         },
         'params': {
             'skip_download': 'f4m fails with --test atm'
index 023911c41f4b05ef0a8137e9cf8fede2cc94cf6d..3ec08b67479396b35ade6150b7bc0d9ff6428df3 100644 (file)
@@ -15,6 +15,7 @@ from ..compat import (
     compat_urlparse,
 )
 from ..utils import (
+    encode_dict,
     ExtractorError,
     int_or_none,
     parse_duration,
@@ -27,8 +28,7 @@ class TwitchBaseIE(InfoExtractor):
 
     _API_BASE = 'https://api.twitch.tv'
     _USHER_BASE = 'http://usher.twitch.tv'
-    _LOGIN_URL = 'https://secure.twitch.tv/login'
-    _LOGIN_POST_URL = 'https://passport.twitch.tv/authentications/new'
+    _LOGIN_URL = 'http://www.twitch.tv/login'
     _NETRC_MACHINE = 'twitch'
 
     def _handle_error(self, response):
@@ -61,26 +61,28 @@ class TwitchBaseIE(InfoExtractor):
         if username is None:
             return
 
-        login_page = self._download_webpage(
+        login_page, handle = self._download_webpage_handle(
             self._LOGIN_URL, None, 'Downloading login page')
 
         login_form = self._hidden_inputs(login_page)
 
         login_form.update({
-            'login': username.encode('utf-8'),
-            'password': password.encode('utf-8'),
+            'username': username,
+            'password': password,
         })
 
+        redirect_url = handle.geturl()
+
         post_url = self._search_regex(
             r'<form[^>]+action=(["\'])(?P<url>.+?)\1', login_page,
-            'post url', default=self._LOGIN_POST_URL, group='url')
+            'post url', default=redirect_url, group='url')
 
         if not post_url.startswith('http'):
-            post_url = compat_urlparse.urljoin(self._LOGIN_URL, post_url)
+            post_url = compat_urlparse.urljoin(redirect_url, post_url)
 
         request = compat_urllib_request.Request(
-            post_url, compat_urllib_parse.urlencode(login_form).encode('utf-8'))
-        request.add_header('Referer', self._LOGIN_URL)
+            post_url, compat_urllib_parse.urlencode(encode_dict(login_form)).encode('utf-8'))
+        request.add_header('Referer', redirect_url)
         response = self._download_webpage(
             request, None, 'Logging in as %s' % username)
 
@@ -238,14 +240,24 @@ class TwitchVodIE(TwitchItemBaseIE):
 
     def _real_extract(self, url):
         item_id = self._match_id(url)
+
         info = self._download_info(self._ITEM_SHORTCUT, item_id)
         access_token = self._download_json(
             '%s/api/vods/%s/access_token' % (self._API_BASE, item_id), item_id,
             'Downloading %s access token' % self._ITEM_TYPE)
+
         formats = self._extract_m3u8_formats(
-            '%s/vod/%s?nauth=%s&nauthsig=%s&allow_source=true'
-            % (self._USHER_BASE, item_id, access_token['token'], access_token['sig']),
+            '%s/vod/%s?%s' % (
+                self._USHER_BASE, item_id,
+                compat_urllib_parse.urlencode({
+                    'allow_source': 'true',
+                    'allow_spectre': 'true',
+                    'player': 'twitchweb',
+                    'nauth': access_token['token'],
+                    'nauthsig': access_token['sig'],
+                })),
             item_id, 'mp4')
+
         self._prefer_source(formats)
         info['formats'] = formats
 
index 1aaa0630519fff52ed51f4964121552592ffbaa6..9d3e46b946843ae0da6b9de525c4aa4b8b3f4cbb 100644 (file)
@@ -1,3 +1,4 @@
+# coding: utf-8
 from __future__ import unicode_literals
 
 import re
@@ -6,23 +7,51 @@ from .common import InfoExtractor
 from ..compat import compat_urllib_request
 from ..utils import (
     float_or_none,
-    unescapeHTML,
+    xpath_text,
+    remove_end,
 )
 
 
 class TwitterCardIE(InfoExtractor):
+    IE_NAME = 'twitter:card'
     _VALID_URL = r'https?://(?:www\.)?twitter\.com/i/cards/tfw/v1/(?P<id>\d+)'
-    _TEST = {
-        'url': 'https://twitter.com/i/cards/tfw/v1/560070183650213889',
-        'md5': 'a74f50b310c83170319ba16de6955192',
-        'info_dict': {
-            'id': '560070183650213889',
-            'ext': 'mp4',
-            'title': 'TwitterCard',
-            'thumbnail': 're:^https?://.*\.jpg$',
-            'duration': 30.033,
+    _TESTS = [
+        {
+            'url': 'https://twitter.com/i/cards/tfw/v1/560070183650213889',
+            'md5': '7d2f6b4d2eb841a7ccc893d479bfceb4',
+            'info_dict': {
+                'id': '560070183650213889',
+                'ext': 'mp4',
+                'title': 'TwitterCard',
+                'thumbnail': 're:^https?://.*\.jpg$',
+                'duration': 30.033,
+            }
         },
-    }
+        {
+            'url': 'https://twitter.com/i/cards/tfw/v1/623160978427936768',
+            'md5': '7ee2a553b63d1bccba97fbed97d9e1c8',
+            'info_dict': {
+                'id': '623160978427936768',
+                'ext': 'mp4',
+                'title': 'TwitterCard',
+                'thumbnail': 're:^https?://.*\.jpg',
+                'duration': 80.155,
+            },
+        },
+        {
+            'url': 'https://twitter.com/i/cards/tfw/v1/654001591733886977',
+            'md5': 'b6f35e8b08a0bec6c8af77a2f4b3a814',
+            'info_dict': {
+                'id': 'dq4Oj5quskI',
+                'ext': 'mp4',
+                'title': 'Ubuntu 11.10 Overview',
+                'description': 'Take a quick peek at what\'s new and improved in Ubuntu 11.10.\n\nOnce installed take a look at 10 Things to Do After Installing: http://www.omgubuntu.co.uk/2011/10/10-things-to-do-after-installing-ubuntu-11-10/',
+                'upload_date': '20111013',
+                'uploader': 'OMG! Ubuntu!',
+                'uploader_id': 'omgubuntu',
+            },
+        }
+    ]
 
     def _real_extract(self, url):
         video_id = self._match_id(url)
@@ -40,10 +69,24 @@ class TwitterCardIE(InfoExtractor):
             request.add_header('User-Agent', user_agent)
             webpage = self._download_webpage(request, video_id)
 
-            config = self._parse_json(
-                unescapeHTML(self._search_regex(
-                    r'data-player-config="([^"]+)"', webpage, 'data player config')),
+            youtube_url = self._html_search_regex(
+                r'<iframe[^>]+src="((?:https?:)?//www.youtube.com/embed/[^"]+)"',
+                webpage, 'youtube iframe', default=None)
+            if youtube_url:
+                return self.url_result(youtube_url, 'Youtube')
+
+            config = self._parse_json(self._html_search_regex(
+                r'data-player-config="([^"]+)"', webpage, 'data player config'),
                 video_id)
+            if 'playlist' not in config:
+                if 'vmapUrl' in config:
+                    vmap_data = self._download_xml(config['vmapUrl'], video_id)
+                    video_url = xpath_text(vmap_data, './/MediaFile').strip()
+                    formats.append({
+                        'url': video_url,
+                    })
+                    break   # same video regardless of UA
+                continue
 
             video_url = config['playlist'][0]['source']
 
@@ -70,3 +113,54 @@ class TwitterCardIE(InfoExtractor):
             'duration': duration,
             'formats': formats,
         }
+
+
+class TwitterIE(InfoExtractor):
+    IE_NAME = 'twitter'
+    _VALID_URL = r'https?://(?:www\.|m\.|mobile\.)?twitter\.com/(?P<user_id>[^/]+)/status/(?P<id>\d+)'
+    _TEMPLATE_URL = 'https://twitter.com/%s/status/%s'
+
+    _TEST = {
+        'url': 'https://twitter.com/freethenipple/status/643211948184596480',
+        'md5': '31cd83a116fc41f99ae3d909d4caf6a0',
+        'info_dict': {
+            'id': '643211948184596480',
+            'ext': 'mp4',
+            'title': 'FREE THE NIPPLE - FTN supporters on Hollywood Blvd today!',
+            'thumbnail': 're:^https?://.*\.jpg',
+            'duration': 12.922,
+            'description': 'FREE THE NIPPLE on Twitter: "FTN supporters on Hollywood Blvd today! http://t.co/c7jHH749xJ"',
+            'uploader': 'FREE THE NIPPLE',
+            'uploader_id': 'freethenipple',
+        },
+    }
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        user_id = mobj.group('user_id')
+        twid = mobj.group('id')
+
+        webpage = self._download_webpage(self._TEMPLATE_URL % (user_id, twid), twid)
+
+        username = remove_end(self._og_search_title(webpage), ' on Twitter')
+
+        title = self._og_search_description(webpage).strip('').replace('\n', ' ')
+
+        # strip  'https -_t.co_BJYgOjSeGA' junk from filenames
+        mobj = re.match(r'“(.*)\s+(https?://[^ ]+)”', title)
+        title, short_url = mobj.groups()
+
+        card_id = self._search_regex(
+            r'["\']/i/cards/tfw/v1/(\d+)', webpage, 'twitter card url')
+        card_url = 'https://twitter.com/i/cards/tfw/v1/' + card_id
+
+        return {
+            '_type': 'url_transparent',
+            'ie_key': 'TwitterCard',
+            'uploader_id': user_id,
+            'uploader': username,
+            'url': card_url,
+            'webpage_url': url,
+            'description': '%s on Twitter: "%s %s"' % (username, title, short_url),
+            'title': username + ' - ' + title,
+        }
index 078d283b23b42a8b2c668fbd40caebde0763581c..382517a4a52487422cf9ff9d045fd73134507972 100644 (file)
@@ -93,6 +93,10 @@ class VidmeIE(InfoExtractor):
         'params': {
             'skip_download': True,
         },
+    }, {
+        # nsfw, user-disabled
+        'url': 'https://vid.me/dzGJ',
+        'only_matching': True,
     }]
 
     def _real_extract(self, url):
@@ -114,6 +118,12 @@ class VidmeIE(InfoExtractor):
 
         video = response['video']
 
+        if video.get('state') == 'user-disabled':
+            raise ExtractorError(
+                'Vidme said: This video has been suspended either due to a copyright claim, '
+                'or for violating the terms of use.',
+                expected=True)
+
         formats = [{
             'format_id': f.get('type'),
             'url': f['uri'],
index 632e57fb4277dbf23c3fa21683f27c70d278f996..7cf930d699fd75ad564ead644a4371d401df137a 100644 (file)
@@ -131,10 +131,11 @@ class ViewsterIE(InfoExtractor):
                 formats.extend(self._extract_f4m_formats(
                     video_url, video_id, f4m_id='hds'))
             elif ext == 'm3u8':
-                formats.extend(self._extract_m3u8_formats(
+                m3u8_formats = self._extract_m3u8_formats(
                     video_url, video_id, 'mp4', m3u8_id='hls',
-                    fatal=False  # m3u8 sometimes fail
-                ))
+                    fatal=False)  # m3u8 sometimes fail
+                if m3u8_formats:
+                    formats.extend(m3u8_formats)
             else:
                 format_id = media.get('Bitrate')
                 f = {
index fa1b2204946a25f9cd2fb80ee4c7e2cf90779048..0f84656c0af8f0015a830b07f221bdb86e9f695e 100644 (file)
@@ -286,7 +286,17 @@ class VimeoIE(VimeoBaseInfoExtractor):
         try:
             try:
                 config_url = self._html_search_regex(
-                    r' data-config-url="(.+?)"', webpage, 'config URL')
+                    r' data-config-url="(.+?)"', webpage,
+                    'config URL', default=None)
+                if not config_url:
+                    # Sometimes new react-based page is served instead of old one that require
+                    # different config URL extraction approach (see
+                    # https://github.com/rg3/youtube-dl/pull/7209)
+                    vimeo_clip_page_config = self._search_regex(
+                        r'vimeo\.clip_page_config\s*=\s*({.+?});', webpage,
+                        'vimeo clip page config')
+                    config_url = self._parse_json(
+                        vimeo_clip_page_config, video_id)['player']['config_url']
                 config_json = self._download_webpage(config_url, video_id)
                 config = json.loads(config_json)
             except RegexNotFoundError:
index c733a48fa26edce6b219d3bf2404267b8c346bf6..be72f3147d5be525c25fe53063df43b337510ae4 100644 (file)
@@ -1,10 +1,14 @@
+# coding: utf-8
 from __future__ import unicode_literals
 
 import re
 import itertools
 
 from .common import InfoExtractor
-from ..utils import unified_strdate
+from ..utils import (
+    int_or_none,
+    unified_strdate,
+)
 
 
 class VineIE(InfoExtractor):
@@ -17,10 +21,12 @@ class VineIE(InfoExtractor):
             'ext': 'mp4',
             'title': 'Chicken.',
             'alt_title': 'Vine by Jack Dorsey',
-            'description': 'Chicken.',
             'upload_date': '20130519',
             'uploader': 'Jack Dorsey',
             'uploader_id': '76',
+            'like_count': int,
+            'comment_count': int,
+            'repost_count': int,
         },
     }, {
         'url': 'https://vine.co/v/MYxVapFvz2z',
@@ -29,11 +35,13 @@ class VineIE(InfoExtractor):
             'id': 'MYxVapFvz2z',
             'ext': 'mp4',
             'title': 'Fuck Da Police #Mikebrown #justice #ferguson #prayforferguson #protesting #NMOS14',
-            'alt_title': 'Vine by Luna',
-            'description': 'Fuck Da Police #Mikebrown #justice #ferguson #prayforferguson #protesting #NMOS14',
+            'alt_title': 'Vine by Mars Ruiz',
             'upload_date': '20140815',
-            'uploader': 'Luna',
+            'uploader': 'Mars Ruiz',
             'uploader_id': '1102363502380728320',
+            'like_count': int,
+            'comment_count': int,
+            'repost_count': int,
         },
     }, {
         'url': 'https://vine.co/v/bxVjBbZlPUH',
@@ -43,14 +51,33 @@ class VineIE(InfoExtractor):
             'ext': 'mp4',
             'title': '#mw3 #ac130 #killcam #angelofdeath',
             'alt_title': 'Vine by Z3k3',
-            'description': '#mw3 #ac130 #killcam #angelofdeath',
             'upload_date': '20130430',
             'uploader': 'Z3k3',
             'uploader_id': '936470460173008896',
+            'like_count': int,
+            'comment_count': int,
+            'repost_count': int,
         },
     }, {
         'url': 'https://vine.co/oembed/MYxVapFvz2z.json',
         'only_matching': True,
+    }, {
+        'url': 'https://vine.co/v/e192BnZnZ9V',
+        'info_dict': {
+            'id': 'e192BnZnZ9V',
+            'ext': 'mp4',
+            'title': 'ยิ้ม~ เขิน~ อาย~ น่าร้ากอ้ะ >//< @n_whitewo @orlameena #lovesicktheseries  #lovesickseason2',
+            'alt_title': 'Vine by Pimry_zaa',
+            'upload_date': '20150705',
+            'uploader': 'Pimry_zaa',
+            'uploader_id': '1135760698325307392',
+            'like_count': int,
+            'comment_count': int,
+            'repost_count': int,
+        },
+        'params': {
+            'skip_download': True,
+        },
     }]
 
     def _real_extract(self, url):
@@ -65,25 +92,26 @@ class VineIE(InfoExtractor):
 
         formats = [{
             'format_id': '%(format)s-%(rate)s' % f,
-            'vcodec': f['format'],
-            'quality': f['rate'],
+            'vcodec': f.get('format'),
+            'quality': f.get('rate'),
             'url': f['videoUrl'],
-        } for f in data['videoUrls']]
+        } for f in data['videoUrls'] if f.get('videoUrl')]
 
         self._sort_formats(formats)
 
+        username = data.get('username')
+
         return {
             'id': video_id,
-            'title': self._og_search_title(webpage),
-            'alt_title': self._og_search_description(webpage, default=None),
-            'description': data['description'],
-            'thumbnail': data['thumbnailUrl'],
-            'upload_date': unified_strdate(data['created']),
-            'uploader': data['username'],
-            'uploader_id': data['userIdStr'],
-            'like_count': data['likes']['count'],
-            'comment_count': data['comments']['count'],
-            'repost_count': data['reposts']['count'],
+            'title': data.get('description') or self._og_search_title(webpage),
+            'alt_title': 'Vine by %s' % username if username else self._og_search_description(webpage, default=None),
+            'thumbnail': data.get('thumbnailUrl'),
+            'upload_date': unified_strdate(data.get('created')),
+            'uploader': username,
+            'uploader_id': data.get('userIdStr'),
+            'like_count': int_or_none(data.get('likes', {}).get('count')),
+            'comment_count': int_or_none(data.get('comments', {}).get('count')),
+            'repost_count': int_or_none(data.get('reposts', {}).get('count')),
             'formats': formats,
         }
 
index b252e36e1162406dedfcc531d7d038e6bd357348..08e821362eaf04c9fac3fbb948774abe87554a20 100644 (file)
@@ -178,6 +178,52 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
             return
 
 
+class YoutubePlaylistBaseInfoExtractor(InfoExtractor):
+    # Extract the video ids from the playlist pages
+    def _entries(self, page, playlist_id):
+        more_widget_html = content_html = page
+        for page_num in itertools.count(1):
+            for video_id, video_title in self.extract_videos_from_page(content_html):
+                yield self.url_result(
+                    video_id, 'Youtube', video_id=video_id,
+                    video_title=video_title)
+
+            mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
+            if not mobj:
+                break
+
+            more = self._download_json(
+                'https://youtube.com/%s' % mobj.group('more'), playlist_id,
+                'Downloading page #%s' % page_num,
+                transform_source=uppercase_escape)
+            content_html = more['content_html']
+            if not content_html.strip():
+                # Some webpages show a "Load more" button but they don't
+                # have more videos
+                break
+            more_widget_html = more['load_more_widget_html']
+
+    def extract_videos_from_page(self, page):
+        ids_in_page = []
+        titles_in_page = []
+        for mobj in re.finditer(self._VIDEO_RE, page):
+            # The link with index 0 is not the first video of the playlist (not sure if still actual)
+            if 'index' in mobj.groupdict() and mobj.group('id') == '0':
+                continue
+            video_id = mobj.group('id')
+            video_title = unescapeHTML(mobj.group('title'))
+            if video_title:
+                video_title = video_title.strip()
+            try:
+                idx = ids_in_page.index(video_id)
+                if video_title and not titles_in_page[idx]:
+                    titles_in_page[idx] = video_title
+            except ValueError:
+                ids_in_page.append(video_id)
+                titles_in_page.append(video_title)
+        return zip(ids_in_page, titles_in_page)
+
+
 class YoutubeIE(YoutubeBaseInfoExtractor):
     IE_DESC = 'YouTube.com'
     _VALID_URL = r"""(?x)^
@@ -1419,7 +1465,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
         }
 
 
-class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
+class YoutubePlaylistIE(YoutubeBaseInfoExtractor, YoutubePlaylistBaseInfoExtractor):
     IE_DESC = 'YouTube.com playlists'
     _VALID_URL = r"""(?x)(?:
                         (?:https?://)?
@@ -1440,7 +1486,7 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
                         ((?:PL|LL|EC|UU|FL|RD|UL)[0-9A-Za-z-_]{10,})
                      )"""
     _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
-    _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)'
+    _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)(?:[^>]+>(?P<title>[^<]+))?'
     IE_NAME = 'youtube:playlist'
     _TESTS = [{
         'url': 'https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
@@ -1557,37 +1603,11 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
             else:
                 self.report_warning('Youtube gives an alert message: ' + match)
 
-        # Extract the video ids from the playlist pages
-        def _entries():
-            more_widget_html = content_html = page
-            for page_num in itertools.count(1):
-                matches = re.finditer(self._VIDEO_RE, content_html)
-                # We remove the duplicates and the link with index 0
-                # (it's not the first video of the playlist)
-                new_ids = orderedSet(m.group('id') for m in matches if m.group('index') != '0')
-                for vid_id in new_ids:
-                    yield self.url_result(vid_id, 'Youtube', video_id=vid_id)
-
-                mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
-                if not mobj:
-                    break
-
-                more = self._download_json(
-                    'https://youtube.com/%s' % mobj.group('more'), playlist_id,
-                    'Downloading page #%s' % page_num,
-                    transform_source=uppercase_escape)
-                content_html = more['content_html']
-                if not content_html.strip():
-                    # Some webpages show a "Load more" button but they don't
-                    # have more videos
-                    break
-                more_widget_html = more['load_more_widget_html']
-
         playlist_title = self._html_search_regex(
             r'(?s)<h1 class="pl-header-title[^"]*">\s*(.*?)\s*</h1>',
             page, 'title')
 
-        return self.playlist_result(_entries(), playlist_id, playlist_title)
+        return self.playlist_result(self._entries(page, playlist_id), playlist_id, playlist_title)
 
     def _real_extract(self, url):
         # Extract playlist id
@@ -1613,10 +1633,11 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
         return self._extract_playlist(playlist_id)
 
 
-class YoutubeChannelIE(InfoExtractor):
+class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor):
     IE_DESC = 'YouTube.com channels'
     _VALID_URL = r'https?://(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/(?P<id>[0-9A-Za-z_-]+)'
     _TEMPLATE_URL = 'https://www.youtube.com/channel/%s/videos'
+    _VIDEO_RE = r'(?:title="(?P<title>[^"]+)"[^>]+)?href="/watch\?v=(?P<id>[0-9A-Za-z_-]+)&?'
     IE_NAME = 'youtube:channel'
     _TESTS = [{
         'note': 'paginated channel',
@@ -1627,22 +1648,6 @@ class YoutubeChannelIE(InfoExtractor):
         }
     }]
 
-    @staticmethod
-    def extract_videos_from_page(page):
-        ids_in_page = []
-        titles_in_page = []
-        for mobj in re.finditer(r'(?:title="(?P<title>[^"]+)"[^>]+)?href="/watch\?v=(?P<id>[0-9A-Za-z_-]+)&?', page):
-            video_id = mobj.group('id')
-            video_title = unescapeHTML(mobj.group('title'))
-            try:
-                idx = ids_in_page.index(video_id)
-                if video_title and not titles_in_page[idx]:
-                    titles_in_page[idx] = video_title
-            except ValueError:
-                ids_in_page.append(video_id)
-                titles_in_page.append(video_title)
-        return zip(ids_in_page, titles_in_page)
-
     def _real_extract(self, url):
         channel_id = self._match_id(url)
 
@@ -1685,29 +1690,7 @@ class YoutubeChannelIE(InfoExtractor):
                 for video_id, video_title in self.extract_videos_from_page(channel_page)]
             return self.playlist_result(entries, channel_id)
 
-        def _entries():
-            more_widget_html = content_html = channel_page
-            for pagenum in itertools.count(1):
-
-                for video_id, video_title in self.extract_videos_from_page(content_html):
-                    yield self.url_result(
-                        video_id, 'Youtube', video_id=video_id,
-                        video_title=video_title)
-
-                mobj = re.search(
-                    r'data-uix-load-more-href="/?(?P<more>[^"]+)"',
-                    more_widget_html)
-                if not mobj:
-                    break
-
-                more = self._download_json(
-                    'https://youtube.com/%s' % mobj.group('more'), channel_id,
-                    'Downloading page #%s' % (pagenum + 1),
-                    transform_source=uppercase_escape)
-                content_html = more['content_html']
-                more_widget_html = more['load_more_widget_html']
-
-        return self.playlist_result(_entries(), channel_id)
+        return self.playlist_result(self._entries(channel_page, channel_id), channel_id)
 
 
 class YoutubeUserIE(YoutubeChannelIE):