[bambuser] Modernize and extract more metadata
[youtube-dl] / youtube_dl / extractor / bambuser.py
1 from __future__ import unicode_literals
2
3 import re
4 import itertools
5
6 from .common import InfoExtractor
7 from ..compat import (
8     compat_urllib_request,
9     compat_str,
10 )
11 from ..utils import (
12     ExtractorError,
13     int_or_none,
14     float_or_none,
15 )
16
17
class BambuserIE(InfoExtractor):
    """Extractor for a single Bambuser broadcast (``bambuser.com/v/<id>``).

    Metadata is requested from the player JSON API, authenticated with
    ``_API_KEY``.
    """
    IE_NAME = 'bambuser'
    _VALID_URL = r'https?://bambuser\.com/v/(?P<id>\d+)'
    _API_KEY = '005f64509e19a868399060af746a00aa'

    _TEST = {
        'url': 'http://bambuser.com/v/4050584',
        # MD5 seems to be flaky, see https://travis-ci.org/rg3/youtube-dl/jobs/14051016#L388
        # 'md5': 'fba8f7693e48fd4e8641b3fd5539a641',
        'info_dict': {
            'id': '4050584',
            'ext': 'flv',
            'title': 'Education engineering days - lightning talks',
            'duration': 3741,
            'uploader': 'pixelversity',
            'uploader_id': '344706',
            'timestamp': 1382976692,
            'upload_date': '20131028',
            'view_count': int,
        },
        'params': {
            # It doesn't respect the 'Range' header, it would download the whole video
            # caused the travis builds to fail: https://travis-ci.org/rg3/youtube-dl/jobs/14493845#L59
            'skip_download': True,
        },
    }

    def _real_extract(self, url):
        """Build the info dict for the broadcast addressed by *url*.

        Raises ExtractorError (flagged as expected) when the API response
        contains a truthy 'error' entry. 'title' and 'url' are mandatory in
        the response; every other field is optional and becomes None when
        it is absent.
        """
        video_id = self._match_id(url)

        info = self._download_json(
            'http://player-c.api.bambuser.com/getVideo.json?api_key=%s&vid=%s'
            % (self._API_KEY, video_id), video_id)

        error = info.get('error')
        if error:
            raise ExtractorError(
                '%s returned error: %s' % (self.IE_NAME, error), expected=True)

        result = info['result']

        # dict.get only applies its default when the key is missing, so an
        # explicit JSON null for 'owner' would come back as None; fall back
        # to an empty dict in that case as well.
        owner = result.get('owner') or {}
        uid = owner.get('uid')
        # Stringify only a uid that is really there; otherwise the literal
        # text 'None' would be reported as the uploader id.
        uploader_id = compat_str(uid) if uid is not None else None

        return {
            'id': video_id,
            'title': result['title'],
            'url': result['url'],
            'thumbnail': result.get('preview'),
            'duration': int_or_none(result.get('length')),
            'uploader': result.get('username'),
            'uploader_id': uploader_id,
            'timestamp': int_or_none(result.get('created')),
            'fps': float_or_none(result.get('framerate')),
            'view_count': int_or_none(result.get('views_total')),
            'comment_count': int_or_none(result.get('comment_count')),
        }
72
73
class BambuserChannelIE(InfoExtractor):
    """Playlist extractor that lists the broadcasts of a Bambuser channel.

    The channel's broadcasts are fetched page by page from the xhr-api
    endpoint and returned as URL results handled by the 'Bambuser' extractor.
    """
    IE_NAME = 'bambuser:channel'
    _VALID_URL = r'https?://bambuser\.com/channel/(?P<user>.*?)(?:/|#|\?|$)'
    # Upper bound on the number of broadcasts a single request can return
    _STEP = 50
    _TEST = {
        'url': 'http://bambuser.com/channel/pixelversity',
        'info_dict': {
            'title': 'pixelversity',
        },
        'playlist_mincount': 60,
    }

    def _real_extract(self, url):
        """Return a playlist dict titled after the channel's user name.

        Pages are requested until the API answers with an empty 'result'
        list; each request asks for broadcasts older than the last one
        already seen.
        """
        user = re.match(self._VALID_URL, url).group('user')

        page_url_template = (
            'http://bambuser.com/xhr-api/index.php?username={user}'
            '&sort=created&access_mode=0%2C1%2C2&limit={count}'
            '&method=broadcast&format=json&vid_older_than={last}'
        )
        referer = 'http://bambuser.com/channel/%s' % user

        entries = []
        oldest_vid = ''
        for page_num in itertools.count(1):
            request = compat_urllib_request.Request(page_url_template.format(
                user=user, count=self._STEP, last=oldest_vid))
            # The API yields no results unless a Referer header is sent
            request.add_header('Referer', referer)
            page = self._download_json(
                request, user, 'Downloading page %d' % page_num)

            broadcasts = page['result']
            if not broadcasts:
                break

            # Cursor for the next request: the last broadcast on this page
            oldest_vid = broadcasts[-1]['vid']
            for broadcast in broadcasts:
                entries.append(self.url_result(broadcast['page'], 'Bambuser'))

        return {
            '_type': 'playlist',
            'title': user,
            'entries': entries,
        }
112             'entries': urls,
113         }