[instagram:user] Add pagination (closes #15934)

[youtube-dl] / youtube_dl / extractor / instagram.py
diff --git a/youtube_dl/extractor/instagram.py b/youtube_dl/extractor/instagram.py

index 20db31f866c89a10e1772baa27dd7b90eba31e21..f9cd11b8e3c0ca00b8a8f6aa8eb4960ff302d690 100644 (file)
--- a/youtube_dl/extractor/instagram.py
+++ b/youtube_dl/extractor/instagram.py
@@ -1,6 +1,7 @@
  from __future__ import unicode_literals
  
  import itertools
+import json
  import re
  
  from .common import InfoExtractor
@@ -130,13 +131,21 @@ class InstagramIE(InfoExtractor):
                  video_url = media.get('video_url')
                  height = int_or_none(media.get('dimensions', {}).get('height'))
                  width = int_or_none(media.get('dimensions', {}).get('width'))
-                description = media.get('caption')
+                description = try_get(
+                    media, lambda x: x['edge_media_to_caption']['edges'][0]['node']['text'],
+                    compat_str) or media.get('caption')
                  thumbnail = media.get('display_src')
-                timestamp = int_or_none(media.get('date'))
+                timestamp = int_or_none(media.get('taken_at_timestamp') or media.get('date'))
                  uploader = media.get('owner', {}).get('full_name')
                  uploader_id = media.get('owner', {}).get('username')
-                like_count = int_or_none(media.get('likes', {}).get('count'))
-                comment_count = int_or_none(media.get('comments', {}).get('count'))
+
+                def get_count(key, kind):
+                    return int_or_none(try_get(
+                        media, (lambda x: x['edge_media_%s' % key]['count'],
+                                lambda x: x['%ss' % kind]['count'])))
+                like_count = get_count('preview_like', 'like')
+                comment_count = get_count('to_comment', 'comment')
+
                  comments = [{
                      'author': comment.get('user', {}).get('username'),
                      'author_id': comment.get('user', {}).get('id'),
@@ -230,36 +239,34 @@ class InstagramUserIE(InfoExtractor):
      }
  
      def _entries(self, uploader_id):
-        query = {
-            '__a': 1,
-        }
-
-        def get_count(kind):
+        def get_count(suffix):
              return int_or_none(try_get(
-                node, lambda x: x['%ss' % kind]['count']))
+                node, lambda x: x['edge_media_' + suffix]['count']))
  
+        cursor = ''
          for page_num in itertools.count(1):
-            page = self._download_json(
-                'https://instagram.com/%s/' % uploader_id, uploader_id,
-                note='Downloading page %d' % page_num,
-                fatal=False, query=query)
-            if not page:
-                break
-
-            nodes = try_get(page, lambda x: x['user']['media']['nodes'], list)
-            if not nodes:
+            media = self._download_json(
+                'https://www.instagram.com/graphql/query/', uploader_id,
+                'Downloading JSON page %d' % page_num, query={
+                    'query_hash': '472f257a40c653c64c666ce877d59d2b',
+                    'variables': json.dumps({
+                        'id': uploader_id,
+                        'first': 100,
+                        'after': cursor,
+                    })
+                })['data']['user']['edge_owner_to_timeline_media']
+
+            edges = media.get('edges')
+            if not edges or not isinstance(edges, list):
                  break
  
-            max_id = None
-
-            for node in nodes:
-                node_id = node.get('id')
-                if node_id:
-                    max_id = node_id
-
+            for edge in edges:
+                node = edge.get('node')
+                if not node or not isinstance(node, dict):
+                    continue
                  if node.get('__typename') != 'GraphVideo' and node.get('is_video') is not True:
                      continue
-                video_id = node.get('code')
+                video_id = node.get('shortcode')
                  if not video_id:
                      continue
  
@@ -268,14 +275,14 @@ class InstagramUserIE(InfoExtractor):
                      ie=InstagramIE.ie_key(), video_id=video_id)
  
                  description = try_get(
-                    node, [lambda x: x['caption'], lambda x: x['text']['id']],
+                    node, lambda x: x['edge_media_to_caption']['edges'][0]['node']['text'],
                      compat_str)
                  thumbnail = node.get('thumbnail_src') or node.get('display_src')
-                timestamp = int_or_none(node.get('date'))
+                timestamp = int_or_none(node.get('taken_at_timestamp'))
  
-                comment_count = get_count('comment')
-                like_count = get_count('like')
-                view_count = int_or_none(node.get('video_views'))
+                comment_count = get_count('to_comment')
+                like_count = get_count('preview_like')
+                view_count = int_or_none(node.get('video_view_count'))
  
                  info.update({
                      'description': description,
@@ -288,12 +295,23 @@ class InstagramUserIE(InfoExtractor):
  
                  yield info
  
-            if not max_id:
+            page_info = media.get('page_info')
+            if not page_info or not isinstance(page_info, dict):
                  break
  
-            query['max_id'] = max_id
+            has_next_page = page_info.get('has_next_page')
+            if not has_next_page:
+                break
+
+            cursor = page_info.get('end_cursor')
+            if not cursor or not isinstance(cursor, compat_str):
+                break
  
      def _real_extract(self, url):
-        uploader_id = self._match_id(url)
+        username = self._match_id(url)
+        uploader_id = self._download_json(
+            'https://instagram.com/%s/' % username, username, query={
+                '__a': 1,
+            })['graphql']['user']['id']
          return self.playlist_result(
-            self._entries(uploader_id), uploader_id, uploader_id)
+            self._entries(uploader_id), username, username)