X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Fextractor%2Ftwitter.py;h=a161f046b2532805d864a26e083de06f68cf7a1f;hb=5c2266df4b9aeb7881ed8c026a038e2a25e43734;hp=1472f22a71bec6a22e133529c6a31642a9d220d1;hpb=c88aec845a680ef9404b637b3dbcf706dcf00b68;p=youtube-dl diff --git a/youtube_dl/extractor/twitter.py b/youtube_dl/extractor/twitter.py index 1472f22a7..a161f046b 100644 --- a/youtube_dl/extractor/twitter.py +++ b/youtube_dl/extractor/twitter.py @@ -4,11 +4,13 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_urllib_request from ..utils import ( float_or_none, xpath_text, remove_end, + int_or_none, + ExtractorError, + sanitized_Request, ) @@ -18,7 +20,7 @@ class TwitterCardIE(InfoExtractor): _TESTS = [ { 'url': 'https://twitter.com/i/cards/tfw/v1/560070183650213889', - 'md5': '7d2f6b4d2eb841a7ccc893d479bfceb4', + 'md5': '4fa26a35f9d1bf4b646590ba8e84be19', 'info_dict': { 'id': '560070183650213889', 'ext': 'mp4', @@ -37,6 +39,33 @@ class TwitterCardIE(InfoExtractor): 'thumbnail': 're:^https?://.*\.jpg', 'duration': 80.155, }, + }, + { + 'url': 'https://twitter.com/i/cards/tfw/v1/654001591733886977', + 'md5': 'b6f35e8b08a0bec6c8af77a2f4b3a814', + 'info_dict': { + 'id': 'dq4Oj5quskI', + 'ext': 'mp4', + 'title': 'Ubuntu 11.10 Overview', + 'description': 'Take a quick peek at what\'s new and improved in Ubuntu 11.10.\n\nOnce installed take a look at 10 Things to Do After Installing: http://www.omgubuntu.co.uk/2011/10/10-things-to-do-after-installing-ubuntu-11-10/', + 'upload_date': '20111013', + 'uploader': 'OMG! Ubuntu!', + 'uploader_id': 'omgubuntu', + }, + 'add_ie': ['Youtube'], + }, + { + 'url': 'https://twitter.com/i/cards/tfw/v1/665289828897005568', + 'md5': 'ab2745d0b0ce53319a534fccaa986439', + 'info_dict': { + 'id': 'iBb2x00UVlv', + 'ext': 'mp4', + 'upload_date': '20151113', + 'uploader_id': '1189339351084113920', + 'uploader': '@ArsenalTerje', + 'title': 'Vine by @ArsenalTerje', + }, + 'add_ie': ['Vine'], } ] @@ -52,10 +81,16 @@ class TwitterCardIE(InfoExtractor): config = None formats = [] for user_agent in USER_AGENTS: - request = compat_urllib_request.Request(url) + request = sanitized_Request(url) request.add_header('User-Agent', user_agent) webpage = self._download_webpage(request, video_id) + iframe_url = self._html_search_regex( + r']+src="((?:https?:)?//(?:www.youtube.com/embed/[^"]+|(?:www\.)?vine\.co/v/\w+/card))"', + webpage, 'video iframe', default=None) + if iframe_url: + return self.url_result(iframe_url) + config = self._parse_json(self._html_search_regex( r'data-player-config="([^"]+)"', webpage, 'data player config'), video_id) @@ -101,9 +136,9 @@ class TwitterIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.|m\.|mobile\.)?twitter\.com/(?P[^/]+)/status/(?P\d+)' _TEMPLATE_URL = 'https://twitter.com/%s/status/%s' - _TEST = { + _TESTS = [{ 'url': 'https://twitter.com/freethenipple/status/643211948184596480', - 'md5': '31cd83a116fc41f99ae3d909d4caf6a0', + 'md5': 'db6612ec5d03355953c3ca9250c97e5e', 'info_dict': { 'id': '643211948184596480', 'ext': 'mp4', @@ -114,7 +149,30 @@ class TwitterIE(InfoExtractor): 'uploader': 'FREE THE NIPPLE', 'uploader_id': 'freethenipple', }, - } + }, { + 'url': 'https://twitter.com/giphz/status/657991469417025536/photo/1', + 'md5': 'f36dcd5fb92bf7057f155e7d927eeb42', + 'info_dict': { + 'id': '657991469417025536', + 'ext': 'mp4', + 'title': 'Gifs - tu vai cai tu vai cai tu nao eh capaz disso tu vai cai', + 'description': 'Gifs on Twitter: "tu vai cai tu vai cai tu nao eh capaz disso tu vai cai https://t.co/tM46VHFlO5"', + 'thumbnail': 're:^https?://.*\.png', + 'uploader': 'Gifs', + 'uploader_id': 'giphz', + }, + }, { + 'url': 'https://twitter.com/starwars/status/665052190608723968', + 'md5': '39b7199856dee6cd4432e72c74bc69d4', + 'info_dict': { + 'id': '665052190608723968', + 'ext': 'mp4', + 'title': 'Star Wars - A new beginning is coming December 18. Watch the official 60 second #TV spot for #StarWars: #TheForceAwakens.', + 'description': 'Star Wars on Twitter: "A new beginning is coming December 18. Watch the official 60 second #TV spot for #StarWars: #TheForceAwakens."', + 'uploader_id': 'starwars', + 'uploader': 'Star Wars', + }, + }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) @@ -125,23 +183,46 @@ class TwitterIE(InfoExtractor): username = remove_end(self._og_search_title(webpage), ' on Twitter') - title = self._og_search_description(webpage).strip('').replace('\n', ' ') + title = description = self._og_search_description(webpage).strip('').replace('\n', ' ').strip('“”') # strip 'https -_t.co_BJYgOjSeGA' junk from filenames - mobj = re.match(r'“(.*)\s+(https?://[^ ]+)”', title) - title, short_url = mobj.groups() + title = re.sub(r'\s+(https?://[^ ]+)', '', title) - card_id = self._search_regex( - r'["\']/i/cards/tfw/v1/(\d+)', webpage, 'twitter card url') - card_url = 'https://twitter.com/i/cards/tfw/v1/' + card_id - - return { - '_type': 'url_transparent', - 'ie_key': 'TwitterCard', + info = { 'uploader_id': user_id, 'uploader': username, - 'url': card_url, 'webpage_url': url, - 'description': '%s on Twitter: "%s %s"' % (username, title, short_url), + 'description': '%s on Twitter: "%s"' % (username, description), 'title': username + ' - ' + title, } + + card_id = self._search_regex( + r'["\']/i/cards/tfw/v1/(\d+)', webpage, 'twitter card url', default=None) + if card_id: + card_url = 'https://twitter.com/i/cards/tfw/v1/' + card_id + info.update({ + '_type': 'url_transparent', + 'ie_key': 'TwitterCard', + 'url': card_url, + }) + return info + + mobj = re.search(r'''(?x) + ]+class="animated-gif"[^>]+ + (?:data-height="(?P\d+)")?[^>]+ + (?:data-width="(?P\d+)")?[^>]+ + (?:poster="(?P[^"]+)")?[^>]*>\s* + ]+video-src="(?P[^"]+)" + ''', webpage) + + if mobj: + info.update({ + 'id': twid, + 'url': mobj.group('url'), + 'height': int_or_none(mobj.group('height')), + 'width': int_or_none(mobj.group('width')), + 'thumbnail': mobj.group('poster'), + }) + return info + + raise ExtractorError('There\'s not video in this tweet.')