X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Fextractor%2Finstagram.py;h=a77f619d291ba4e02ecdc2c785795229fd0de2ca;hb=9cbd4dda10ad248a5268ec1e0e563cf97024a8b9;hp=5ebc30a1058f408db8a8a562258a016533579a09;hpb=a56e74e2713ed45f4096735cf49d1d97b5e75389;p=youtube-dl diff --git a/youtube_dl/extractor/instagram.py b/youtube_dl/extractor/instagram.py index 5ebc30a10..a77f619d2 100644 --- a/youtube_dl/extractor/instagram.py +++ b/youtube_dl/extractor/instagram.py @@ -1,12 +1,13 @@ from __future__ import unicode_literals +import itertools import re from .common import InfoExtractor +from ..compat import compat_str from ..utils import ( get_element_by_attribute, int_or_none, - limit_length, lowercase_escape, try_get, ) @@ -22,7 +23,7 @@ class InstagramIE(InfoExtractor): 'ext': 'mp4', 'title': 'Video by naomipq', 'description': 'md5:1f17f0ab29bd6fe2bfad705f58de3cb8', - 'thumbnail': 're:^https?://.*\.jpg', + 'thumbnail': r're:^https?://.*\.jpg', 'timestamp': 1371748545, 'upload_date': '20130620', 'uploader_id': 'naomipq', @@ -38,7 +39,7 @@ class InstagramIE(InfoExtractor): 'id': 'BA-pQFBG8HZ', 'ext': 'mp4', 'title': 'Video by britneyspears', - 'thumbnail': 're:^https?://.*\.jpg', + 'thumbnail': r're:^https?://.*\.jpg', 'timestamp': 1453760977, 'upload_date': '20160125', 'uploader_id': 'britneyspears', @@ -50,6 +51,33 @@ class InstagramIE(InfoExtractor): 'params': { 'skip_download': True, }, + }, { + # multi video post + 'url': 'https://www.instagram.com/p/BQ0eAlwhDrw/', + 'playlist': [{ + 'info_dict': { + 'id': 'BQ0dSaohpPW', + 'ext': 'mp4', + 'title': 'Video 1', + }, + }, { + 'info_dict': { + 'id': 'BQ0dTpOhuHT', + 'ext': 'mp4', + 'title': 'Video 2', + }, + }, { + 'info_dict': { + 'id': 'BQ0dT7RBFeF', + 'ext': 'mp4', + 'title': 'Video 3', + }, + }], + 'info_dict': { + 'id': 'BQ0eAlwhDrw', + 'title': 'Post by instagram', + 'description': 'md5:0f9203fc6a2ce4d228da5754bcf54957', + }, }, { 'url': 'https://instagram.com/p/-Cmh1cukG2/', 'only_matching': True, @@ -84,7 +112,8 @@ class InstagramIE(InfoExtractor): webpage = self._download_webpage(url, video_id) (video_url, description, thumbnail, timestamp, uploader, - uploader_id, like_count, comment_count) = [None] * 8 + uploader_id, like_count, comment_count, comments, height, + width) = [None] * 11 shared_data = self._parse_json( self._search_regex( @@ -93,28 +122,73 @@ class InstagramIE(InfoExtractor): video_id, fatal=False) if shared_data: media = try_get( - shared_data, lambda x: x['entry_data']['PostPage'][0]['media'], dict) + shared_data, + (lambda x: x['entry_data']['PostPage'][0]['graphql']['shortcode_media'], + lambda x: x['entry_data']['PostPage'][0]['media']), + dict) if media: video_url = media.get('video_url') - description = media.get('caption') + height = int_or_none(media.get('dimensions', {}).get('height')) + width = int_or_none(media.get('dimensions', {}).get('width')) + description = try_get( + media, lambda x: x['edge_media_to_caption']['edges'][0]['node']['text'], + compat_str) or media.get('caption') thumbnail = media.get('display_src') - timestamp = int_or_none(media.get('date')) + timestamp = int_or_none(media.get('taken_at_timestamp') or media.get('date')) uploader = media.get('owner', {}).get('full_name') uploader_id = media.get('owner', {}).get('username') - like_count = int_or_none(media.get('likes', {}).get('count')) - comment_count = int_or_none(media.get('comments', {}).get('count')) + + def get_count(key, kind): + return int_or_none(try_get( + media, (lambda x: x['edge_media_%s' % key]['count'], + lambda x: x['%ss' % kind]['count']))) + like_count = get_count('preview_like', 'like') + comment_count = get_count('to_comment', 'comment') + comments = [{ 'author': comment.get('user', {}).get('username'), 'author_id': comment.get('user', {}).get('id'), 'id': comment.get('id'), 'text': comment.get('text'), 'timestamp': int_or_none(comment.get('created_at')), - } for comment in media.get('comments', {}).get('nodes', []) - if comment.get('text')] + } for comment in media.get( + 'comments', {}).get('nodes', []) if comment.get('text')] + if not video_url: + edges = try_get( + media, lambda x: x['edge_sidecar_to_children']['edges'], + list) or [] + if edges: + entries = [] + for edge_num, edge in enumerate(edges, start=1): + node = try_get(edge, lambda x: x['node'], dict) + if not node: + continue + node_video_url = try_get(node, lambda x: x['video_url'], compat_str) + if not node_video_url: + continue + entries.append({ + 'id': node.get('shortcode') or node['id'], + 'title': 'Video %d' % edge_num, + 'url': node_video_url, + 'thumbnail': node.get('display_url'), + 'width': int_or_none(try_get(node, lambda x: x['dimensions']['width'])), + 'height': int_or_none(try_get(node, lambda x: x['dimensions']['height'])), + 'view_count': int_or_none(node.get('video_view_count')), + }) + return self.playlist_result( + entries, video_id, + 'Post by %s' % uploader_id if uploader_id else None, + description) if not video_url: video_url = self._og_search_video_url(webpage, secure=False) + formats = [{ + 'url': video_url, + 'width': width, + 'height': height, + }] + if not uploader_id: uploader_id = self._search_regex( r'"owner"\s*:\s*{\s*"username"\s*:\s*"(.+?)"', @@ -131,7 +205,7 @@ class InstagramIE(InfoExtractor): return { 'id': video_id, - 'url': video_url, + 'formats': formats, 'ext': 'mp4', 'title': 'Video by %s' % uploader_id, 'description': description, @@ -146,7 +220,7 @@ class InstagramIE(InfoExtractor): class InstagramUserIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?instagram\.com/(?P[^/]{2,})/?(?:$|[?#])' + _VALID_URL = r'https?://(?:www\.)?instagram\.com/(?P[^/]{2,})/?(?:$|[?#])' IE_DESC = 'Instagram user profile' IE_NAME = 'instagram:user' _TEST = { @@ -155,82 +229,79 @@ class InstagramUserIE(InfoExtractor): 'id': 'porsche', 'title': 'porsche', }, - 'playlist_mincount': 2, - 'playlist': [{ - 'info_dict': { - 'id': '614605558512799803_462752227', - 'ext': 'mp4', - 'title': '#Porsche Intelligent Performance.', - 'thumbnail': 're:^https?://.*\.jpg', - 'uploader': 'Porsche', - 'uploader_id': 'porsche', - 'timestamp': 1387486713, - 'upload_date': '20131219', - }, - }], + 'playlist_count': 5, 'params': { 'extract_flat': True, 'skip_download': True, + 'playlistend': 5, } } - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - uploader_id = mobj.group('username') + def _entries(self, uploader_id): + query = { + '__a': 1, + } + + def get_count(kind): + return int_or_none(try_get( + node, lambda x: x['%ss' % kind]['count'])) - entries = [] - page_count = 0 - media_url = 'http://instagram.com/%s/media' % uploader_id - while True: + for page_num in itertools.count(1): page = self._download_json( - media_url, uploader_id, - note='Downloading page %d ' % (page_count + 1), - ) - page_count += 1 + 'https://instagram.com/%s/' % uploader_id, uploader_id, + note='Downloading page %d' % page_num, + fatal=False, query=query) + if not page: + break + + nodes = try_get(page, lambda x: x['user']['media']['nodes'], list) + if not nodes: + break + + max_id = None + + for node in nodes: + node_id = node.get('id') + if node_id: + max_id = node_id - for it in page['items']: - if it.get('type') != 'video': + if node.get('__typename') != 'GraphVideo' and node.get('is_video') is not True: continue - like_count = int_or_none(it.get('likes', {}).get('count')) - user = it.get('user', {}) - - formats = [{ - 'format_id': k, - 'height': v.get('height'), - 'width': v.get('width'), - 'url': v['url'], - } for k, v in it['videos'].items()] - self._sort_formats(formats) - - thumbnails_el = it.get('images', {}) - thumbnail = thumbnails_el.get('thumbnail', {}).get('url') - - # In some cases caption is null, which corresponds to None - # in python. As a result, it.get('caption', {}) gives None - title = (it.get('caption') or {}).get('text', it['id']) - - entries.append({ - 'id': it['id'], - 'title': limit_length(title, 80), - 'formats': formats, + video_id = node.get('code') + if not video_id: + continue + + info = self.url_result( + 'https://instagram.com/p/%s/' % video_id, + ie=InstagramIE.ie_key(), video_id=video_id) + + description = try_get( + node, [lambda x: x['caption'], lambda x: x['text']['id']], + compat_str) + thumbnail = node.get('thumbnail_src') or node.get('display_src') + timestamp = int_or_none(node.get('date')) + + comment_count = get_count('comment') + like_count = get_count('like') + view_count = int_or_none(node.get('video_views')) + + info.update({ + 'description': description, 'thumbnail': thumbnail, - 'webpage_url': it.get('link'), - 'uploader': user.get('full_name'), - 'uploader_id': user.get('username'), + 'timestamp': timestamp, + 'comment_count': comment_count, 'like_count': like_count, - 'timestamp': int_or_none(it.get('created_time')), + 'view_count': view_count, }) - if not page['items']: + yield info + + if not max_id: break - max_id = page['items'][-1]['id'].split('_')[0] - media_url = ( - 'http://instagram.com/%s/media?max_id=%s' % ( - uploader_id, max_id)) - return { - '_type': 'playlist', - 'entries': entries, - 'id': uploader_id, - 'title': uploader_id, - } + query['max_id'] = max_id + + def _real_extract(self, url): + uploader_id = self._match_id(url) + return self.playlist_result( + self._entries(uploader_id), uploader_id, uploader_id)