X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Fextractor%2Ftwitter.py;h=64c429f02c61b1a1fea49c7900e93757740bc28d;hb=acb6e97e6a039b7bcce952e9fcf71a5ccf92aec3;hp=1cdca544cb31368c09fda9b05d501ab2fa827dcf;hpb=77a54b6a658059a11de415d793588fdbfec14194;p=youtube-dl diff --git a/youtube_dl/extractor/twitter.py b/youtube_dl/extractor/twitter.py index 1cdca544c..64c429f02 100644 --- a/youtube_dl/extractor/twitter.py +++ b/youtube_dl/extractor/twitter.py @@ -4,21 +4,29 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_urllib_request from ..utils import ( float_or_none, xpath_text, remove_end, + int_or_none, + ExtractorError, + sanitized_Request, ) -class TwitterCardIE(InfoExtractor): +class TwitterBaseIE(InfoExtractor): + def _get_vmap_video_url(self, vmap_url, video_id): + vmap_data = self._download_xml(vmap_url, video_id) + return xpath_text(vmap_data, './/MediaFile').strip() + + +class TwitterCardIE(TwitterBaseIE): IE_NAME = 'twitter:card' _VALID_URL = r'https?://(?:www\.)?twitter\.com/i/cards/tfw/v1/(?P\d+)' _TESTS = [ { 'url': 'https://twitter.com/i/cards/tfw/v1/560070183650213889', - 'md5': '7d2f6b4d2eb841a7ccc893d479bfceb4', + # MD5 checksums are different in different places 'info_dict': { 'id': '560070183650213889', 'ext': 'mp4', @@ -37,6 +45,33 @@ class TwitterCardIE(InfoExtractor): 'thumbnail': 're:^https?://.*\.jpg', 'duration': 80.155, }, + }, + { + 'url': 'https://twitter.com/i/cards/tfw/v1/654001591733886977', + 'md5': 'd4724ffe6d2437886d004fa5de1043b3', + 'info_dict': { + 'id': 'dq4Oj5quskI', + 'ext': 'mp4', + 'title': 'Ubuntu 11.10 Overview', + 'description': 'Take a quick peek at what\'s new and improved in Ubuntu 11.10.\n\nOnce installed take a look at 10 Things to Do After Installing: http://www.omgubuntu.co.uk/2011/10/10-things-to-do-after-installing-ubuntu-11-10/', + 'upload_date': '20111013', + 'uploader': 'OMG! Ubuntu!', + 'uploader_id': 'omgubuntu', + }, + 'add_ie': ['Youtube'], + }, + { + 'url': 'https://twitter.com/i/cards/tfw/v1/665289828897005568', + 'md5': 'ab2745d0b0ce53319a534fccaa986439', + 'info_dict': { + 'id': 'iBb2x00UVlv', + 'ext': 'mp4', + 'upload_date': '20151113', + 'uploader_id': '1189339351084113920', + 'uploader': 'ArsenalTerje', + 'title': 'Vine by ArsenalTerje', + }, + 'add_ie': ['Vine'], } ] @@ -52,19 +87,23 @@ class TwitterCardIE(InfoExtractor): config = None formats = [] for user_agent in USER_AGENTS: - request = compat_urllib_request.Request(url) + request = sanitized_Request(url) request.add_header('User-Agent', user_agent) webpage = self._download_webpage(request, video_id) + iframe_url = self._html_search_regex( + r']+src="((?:https?:)?//(?:www.youtube.com/embed/[^"]+|(?:www\.)?vine\.co/v/\w+/card))"', + webpage, 'video iframe', default=None) + if iframe_url: + return self.url_result(iframe_url) + config = self._parse_json(self._html_search_regex( r'data-player-config="([^"]+)"', webpage, 'data player config'), video_id) if 'playlist' not in config: if 'vmapUrl' in config: - vmap_data = self._download_xml(config['vmapUrl'], video_id) - video_url = xpath_text(vmap_data, './/MediaFile').strip() formats.append({ - 'url': video_url, + 'url': self._get_vmap_video_url(config['vmapUrl'], video_id), }) break # same video regardless of UA continue @@ -101,9 +140,9 @@ class TwitterIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.|m\.|mobile\.)?twitter\.com/(?P[^/]+)/status/(?P\d+)' _TEMPLATE_URL = 'https://twitter.com/%s/status/%s' - _TEST = { + _TESTS = [{ 'url': 'https://twitter.com/freethenipple/status/643211948184596480', - 'md5': '31cd83a116fc41f99ae3d909d4caf6a0', + # MD5 checksums are different in different places 'info_dict': { 'id': '643211948184596480', 'ext': 'mp4', @@ -114,7 +153,30 @@ class TwitterIE(InfoExtractor): 'uploader': 'FREE THE NIPPLE', 'uploader_id': 'freethenipple', }, - } + }, { + 'url': 'https://twitter.com/giphz/status/657991469417025536/photo/1', + 'md5': 'f36dcd5fb92bf7057f155e7d927eeb42', + 'info_dict': { + 'id': '657991469417025536', + 'ext': 'mp4', + 'title': 'Gifs - tu vai cai tu vai cai tu nao eh capaz disso tu vai cai', + 'description': 'Gifs on Twitter: "tu vai cai tu vai cai tu nao eh capaz disso tu vai cai https://t.co/tM46VHFlO5"', + 'thumbnail': 're:^https?://.*\.png', + 'uploader': 'Gifs', + 'uploader_id': 'giphz', + }, + }, { + 'url': 'https://twitter.com/starwars/status/665052190608723968', + 'md5': '39b7199856dee6cd4432e72c74bc69d4', + 'info_dict': { + 'id': '665052190608723968', + 'ext': 'mp4', + 'title': 'Star Wars - A new beginning is coming December 18. Watch the official 60 second #TV spot for #StarWars: #TheForceAwakens.', + 'description': 'Star Wars on Twitter: "A new beginning is coming December 18. Watch the official 60 second #TV spot for #StarWars: #TheForceAwakens."', + 'uploader_id': 'starwars', + 'uploader': 'Star Wars', + }, + }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) @@ -125,23 +187,75 @@ class TwitterIE(InfoExtractor): username = remove_end(self._og_search_title(webpage), ' on Twitter') - title = self._og_search_description(webpage).strip('').replace('\n', ' ') + title = description = self._og_search_description(webpage).strip('').replace('\n', ' ').strip('“”') # strip 'https -_t.co_BJYgOjSeGA' junk from filenames - mobj = re.match(r'“(.*)\s+(http://[^ ]+)”', title) - title, short_url = mobj.groups() - - card_id = self._search_regex( - r'["\']/i/cards/tfw/v1/(\d+)', webpage, 'twitter card url') - card_url = 'https://twitter.com/i/cards/tfw/v1/' + card_id + title = re.sub(r'\s+(https?://[^ ]+)', '', title) - return { - '_type': 'url_transparent', - 'ie_key': 'TwitterCard', + info = { 'uploader_id': user_id, 'uploader': username, - 'url': card_url, 'webpage_url': url, - 'description': '%s on Twitter: "%s %s"' % (username, title, short_url), + 'description': '%s on Twitter: "%s"' % (username, description), 'title': username + ' - ' + title, } + + card_id = self._search_regex( + r'["\']/i/cards/tfw/v1/(\d+)', webpage, 'twitter card url', default=None) + if card_id: + card_url = 'https://twitter.com/i/cards/tfw/v1/' + card_id + info.update({ + '_type': 'url_transparent', + 'ie_key': 'TwitterCard', + 'url': card_url, + }) + return info + + mobj = re.search(r'''(?x) + ]+class="animated-gif"[^>]+ + (?:data-height="(?P\d+)")?[^>]+ + (?:data-width="(?P\d+)")?[^>]+ + (?:poster="(?P[^"]+)")?[^>]*>\s* + ]+video-src="(?P[^"]+)" + ''', webpage) + + if mobj: + info.update({ + 'id': twid, + 'url': mobj.group('url'), + 'height': int_or_none(mobj.group('height')), + 'width': int_or_none(mobj.group('width')), + 'thumbnail': mobj.group('poster'), + }) + return info + + raise ExtractorError('There\'s not video in this tweet.') + + +class TwitterAmplifyIE(TwitterBaseIE): + IE_NAME = 'twitter:amplify' + _VALID_URL = 'https?://amp\.twimg\.com/v/(?P[0-9a-f\-]{36})' + + _TEST = { + 'url': 'https://amp.twimg.com/v/0ba0c3c7-0af3-4c0a-bed5-7efd1ffa2951', + 'md5': '7df102d0b9fd7066b86f3159f8e81bf6', + 'info_dict': { + 'id': '0ba0c3c7-0af3-4c0a-bed5-7efd1ffa2951', + 'ext': 'mp4', + 'title': 'Twitter Video', + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + vmap_url = self._html_search_meta( + 'twitter:amplify:vmap', webpage, 'vmap url') + video_url = self._get_vmap_video_url(vmap_url, video_id) + + return { + 'id': video_id, + 'title': 'Twitter Video', + 'url': video_url, + }