[iconosquare] fix info extraction
[youtube-dl] / youtube_dl / extractor / iconosquare.py
1 from __future__ import unicode_literals
2
3 from .common import InfoExtractor
4 from ..utils import (
5     int_or_none,
6     get_element_by_id,
7 )
8
9
class IconosquareIE(InfoExtractor):
    """Extractor for video posts served from iconosquare.com / statigr.am.

    All metadata except the title is read from the JSON document embedded
    in the page element whose id is ``mediaJson``.
    """

    _VALID_URL = r'https?://(?:www\.)?(?:iconosquare\.com|statigr\.am)/p/(?P<id>[^/]+)'
    _TEST = {
        'url': 'http://statigr.am/p/522207370455279102_24101272',
        'md5': '6eb93b882a3ded7c378ee1d6884b1814',
        'info_dict': {
            'id': '522207370455279102_24101272',
            'ext': 'mp4',
            'title': 'A little over a year ago, I posted my first #dailycortado, a drink introduced to...',
            'description': 'md5:644406a9ec27457ed7aa7a9ebcd4ce3d',
            'timestamp': 1376471991,
            'upload_date': '20130814',
            'uploader': 'aguynamedpatrick',
            'uploader_id': '24101272',
            'comment_count': int,
            'like_count': int,
        },
    }

    def _real_extract(self, url):
        """Build and return the info dict for the post at ``url``.

        The page is downloaded once; formats come from the ``videos`` entry
        of the embedded media JSON (required -- a missing entry raises
        ``KeyError``), the title from the page's ``<title>`` tag, and every
        other field from optional entries of the same JSON.
        """
        video_id = self._match_id(url)

        webpage = self._download_webpage(url, video_id)

        media = self._parse_json(
            get_element_by_id('mediaJson', webpage),
            video_id)

        formats = [{
            'url': f['url'],
            'format_id': format_id,
            'width': int_or_none(f.get('width')),
            'height': int_or_none(f.get('height'))
        } for format_id, f in media['videos'].items()]
        self._sort_formats(formats)

        title = self._html_search_regex(
            r'<title>(.+?)</title>',
            webpage, 'title')

        # Optional sub-objects may be absent *or* present with a JSON null
        # value (e.g. a post without a caption).  ``dict.get(key, {})`` only
        # covers the first case, so normalise both to an empty dict here to
        # keep optional metadata from aborting the whole extraction.
        caption = media.get('caption') or {}
        user = media.get('user') or {}
        comments = media.get('comments') or {}
        likes = media.get('likes') or {}
        images = media.get('images') or {}

        # Prefer the post's own creation time, fall back to the caption's.
        timestamp = int_or_none(
            media.get('created_time') or caption.get('created_time'))
        description = caption.get('text')

        uploader = user.get('username')
        uploader_id = user.get('id')

        comment_count = int_or_none(comments.get('count'))
        like_count = int_or_none(likes.get('count'))

        thumbnails = [{
            'url': t['url'],
            'id': thumbnail_id,
            'width': int_or_none(t.get('width')),
            'height': int_or_none(t.get('height'))
        } for thumbnail_id, t in images.items()]

        return {
            'id': video_id,
            'title': title,
            'description': description,
            'thumbnails': thumbnails,
            'timestamp': timestamp,
            'uploader': uploader,
            'uploader_id': uploader_id,
            'comment_count': comment_count,
            'like_count': like_count,
            'formats': formats,
        }