X-Git-Url: http://git.bitcoin.ninja/index.cgi?a=blobdiff_plain;f=youtube_dl%2Fextractor%2Finstagram.py;h=3cbe77ad80f2fc9a03c738745524d5dac98c9d37;hb=2c347352677f023678ffd488a51b19f54b97fa36;hp=5109f26ce860edc0675eaba6350e0ab820e7fe27;hpb=9b583dca4cf3b623323de8fadf6dc851b7111fd2;p=youtube-dl diff --git a/youtube_dl/extractor/instagram.py b/youtube_dl/extractor/instagram.py index 5109f26ce..3cbe77ad8 100644 --- a/youtube_dl/extractor/instagram.py +++ b/youtube_dl/extractor/instagram.py @@ -4,14 +4,17 @@ import re from .common import InfoExtractor from ..utils import ( + get_element_by_attribute, int_or_none, + limit_length, + lowercase_escape, ) class InstagramIE(InfoExtractor): - _VALID_URL = r'http://instagram\.com/p/(?P<id>.*?)/' - _TEST = { - 'url': 'http://instagram.com/p/aye83DjauH/?foo=bar#abc', + _VALID_URL = r'(?P<url>https?://(?:www\.)?instagram\.com/p/(?P<id>[^/?#&]+))' + _TESTS = [{ + 'url': 'https://instagram.com/p/aye83DjauH/?foo=bar#abc', 'md5': '0d2da106a9d2631273e192b372806516', 'info_dict': { 'id': 'aye83DjauH', @@ -20,16 +23,56 @@ class InstagramIE(InfoExtractor): 'title': 'Video by naomipq', 'description': 'md5:1f17f0ab29bd6fe2bfad705f58de3cb8', } - } + }, { + # missing description + 'url': 'https://www.instagram.com/p/BA-pQFBG8HZ/?taken-by=britneyspears', + 'info_dict': { + 'id': 'BA-pQFBG8HZ', + 'ext': 'mp4', + 'uploader_id': 'britneyspears', + 'title': 'Video by britneyspears', + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'https://instagram.com/p/-Cmh1cukG2/', + 'only_matching': True, + }, { + 'url': 'http://instagram.com/p/9o6LshA7zy/embed/', + 'only_matching': True, + }] + + @staticmethod + def _extract_embed_url(webpage): + mobj = re.search( + r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//(?:www\.)?instagram\.com/p/[^/]+/embed.*?)\1', + webpage) + if mobj: + return mobj.group('url') + + blockquote_el = get_element_by_attribute( + 'class', 'instagram-media', webpage) + if blockquote_el is None: + return + + mobj = re.search( 
+ r'<a[^>]+href=([\'"])(?P<link>[^\'"]+)\1', blockquote_el) + if mobj: + return mobj.group('link') def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') + url = mobj.group('url') + webpage = self._download_webpage(url, video_id) uploader_id = self._search_regex(r'"owner":{"username":"(.+?)"', - webpage, 'uploader id', fatal=False) - desc = self._search_regex(r'"caption":"(.*?)"', webpage, 'description', - fatal=False) + webpage, 'uploader id', fatal=False) + desc = self._search_regex( + r'"caption":"(.+?)"', webpage, 'description', default=None) + if desc is not None: + desc = lowercase_escape(desc) return { 'id': video_id, @@ -43,11 +86,11 @@ class InstagramIE(InfoExtractor): class InstagramUserIE(InfoExtractor): - _VALID_URL = r'http://instagram\.com/(?P<username>[^/]{2,})/?(?:$|[?#])' + _VALID_URL = r'https?://(?:www\.)?instagram\.com/(?P<username>[^/]{2,})/?(?:$|[?#])' IE_DESC = 'Instagram user profile' IE_NAME = 'instagram:user' _TEST = { - 'url': 'http://instagram.com/porsche', + 'url': 'https://instagram.com/porsche', 'info_dict': { 'id': 'porsche', 'title': 'porsche', @@ -102,11 +145,13 @@ class InstagramUserIE(InfoExtractor): thumbnails_el = it.get('images', {}) thumbnail = thumbnails_el.get('thumbnail', {}).get('url') - title = it.get('caption', {}).get('text', it['id']) + # In some cases caption is null, which corresponds to None + # in python. As a result, it.get('caption', {}) gives None + title = (it.get('caption') or {}).get('text', it['id']) entries.append({ 'id': it['id'], - 'title': title, + 'title': limit_length(title, 80), 'formats': formats, 'thumbnail': thumbnail, 'webpage_url': it.get('link'), @@ -118,7 +163,7 @@ class InstagramUserIE(InfoExtractor): if not page['items']: break - max_id = page['items'][-1]['id'] + max_id = page['items'][-1]['id'].split('_')[0] media_url = ( 'http://instagram.com/%s/media?max_id=%s' % ( uploader_id, max_id))