_ Git - youtube-dl/blob - youtube_dl/extractor/rutube.py

   1 # coding: utf-8
   2 from __future__ import unicode_literals
   3
   4 import re
   5 import itertools
   6
   7 from .common import InfoExtractor
   8 from ..compat import (
   9     compat_str,
  10     compat_parse_qs,
  11     compat_urllib_parse_urlparse,
  12 )
  13 from ..utils import (
  14     determine_ext,
  15     unified_strdate,
  16     try_get,
  17     int_or_none,
  18 )
  19
  20
  21 class RutubeIE(InfoExtractor):
  22     IE_NAME = 'rutube'
  23     IE_DESC = 'Rutube videos'
  24     _VALID_URL = r'https?://rutube\.ru/(?:video|(?:play/)?embed)/(?P<id>[\da-z]{32})'
  25
  26     _TESTS = [{
  27         'url': 'http://rutube.ru/video/3eac3b4561676c17df9132a9a1e62e3e/',
  28         'info_dict': {
  29             'id': '3eac3b4561676c17df9132a9a1e62e3e',
  30             'ext': 'mp4',
  31             'title': 'Раненный кенгуру забежал в аптеку',
  32             'description': 'http://www.ntdtv.ru ',
  33             'duration': 80,
  34             'uploader': 'NTDRussian',
  35             'uploader_id': '29790',
  36             'upload_date': '20131016',
  37             'age_limit': 0,
  38         },
  39         'params': {
  40             # It requires ffmpeg (m3u8 download)
  41             'skip_download': True,
  42         },
  43     }, {
  44         'url': 'http://rutube.ru/play/embed/a10e53b86e8f349080f718582ce4c661',
  45         'only_matching': True,
  46     }, {
  47         'url': 'http://rutube.ru/embed/a10e53b86e8f349080f718582ce4c661',
  48         'only_matching': True,
  49     }, {
  50         'url': 'http://rutube.ru/video/3eac3b4561676c17df9132a9a1e62e3e/?pl_id=4252',
  51         'only_matching': True,
  52     }]
  53
  54     @classmethod
  55     def suitable(cls, url):
  56         parts = compat_urllib_parse_urlparse(url)
  57         params = compat_parse_qs(parts.query)
  58
  59         # see if URL without parameters is OK
  60         res = super(RutubeIE, cls).suitable(url)
  61
  62         if params:  # we only allow pl_id parameter in the url
  63             res = res and 'pl_id' in params and len(params) == 1
  64
  65         return res
  66
  67     @staticmethod
  68     def _extract_urls(webpage):
  69         return [mobj.group('url') for mobj in re.finditer(
  70             r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//rutube\.ru/embed/[\da-z]{32}.*?)\1',
  71             webpage)]
  72
  73     def _real_extract(self, url):
  74         video_id = self._match_id(url)
  75         video = self._download_json(
  76             'http://rutube.ru/api/video/%s/?format=json' % video_id,
  77             video_id, 'Downloading video JSON')
  78
  79         # Some videos don't have the author field
  80         author = video.get('author') or {}
  81
  82         options = self._download_json(
  83             'http://rutube.ru/api/play/options/%s/?format=json' % video_id,
  84             video_id, 'Downloading options JSON')
  85
  86         formats = []
  87         for format_id, format_url in options['video_balancer'].items():
  88             ext = determine_ext(format_url)
  89             if ext == 'm3u8':
  90                 formats.extend(self._extract_m3u8_formats(
  91                     format_url, video_id, 'mp4', m3u8_id=format_id, fatal=False))
  92             elif ext == 'f4m':
  93                 formats.extend(self._extract_f4m_formats(
  94                     format_url, video_id, f4m_id=format_id, fatal=False))
  95             else:
  96                 formats.append({
  97                     'url': format_url,
  98                     'format_id': format_id,
  99                 })
 100         self._sort_formats(formats)
 101
 102         return {
 103             'id': video['id'],
 104             'title': video['title'],
 105             'description': video['description'],
 106             'duration': video['duration'],
 107             'view_count': video['hits'],
 108             'formats': formats,
 109             'thumbnail': video['thumbnail_url'],
 110             'uploader': author.get('name'),
 111             'uploader_id': compat_str(author['id']) if author else None,
 112             'upload_date': unified_strdate(video['created_ts']),
 113             'age_limit': 18 if video['is_adult'] else 0,
 114         }
 115
 116
 117 class RutubeEmbedIE(InfoExtractor):
 118     IE_NAME = 'rutube:embed'
 119     IE_DESC = 'Rutube embedded videos'
 120     _VALID_URL = r'https?://rutube\.ru/(?:video|play)/embed/(?P<id>[0-9]+)'
 121
 122     _TESTS = [{
 123         'url': 'http://rutube.ru/video/embed/6722881?vk_puid37=&vk_puid38=',
 124         'info_dict': {
 125             'id': 'a10e53b86e8f349080f718582ce4c661',
 126             'ext': 'mp4',
 127             'upload_date': '20131223',
 128             'uploader_id': '297833',
 129             'description': 'Видео группы ★http://vk.com/foxkidsreset★ музей Fox Kids и Jetix<br/><br/> восстановлено и сделано в шикоформате subziro89 http://vk.com/subziro89',
 130             'uploader': 'subziro89 ILya',
 131             'title': 'Мистический городок Эйри в Индиан 5 серия озвучка subziro89',
 132         },
 133         'params': {
 134             'skip_download': 'Requires ffmpeg',
 135         },
 136     }, {
 137         'url': 'http://rutube.ru/play/embed/8083783',
 138         'only_matching': True,
 139     }]
 140
 141     def _real_extract(self, url):
 142         embed_id = self._match_id(url)
 143         webpage = self._download_webpage(url, embed_id)
 144
 145         canonical_url = self._html_search_regex(
 146             r'<link\s+rel="canonical"\s+href="([^"]+?)"', webpage,
 147             'Canonical URL')
 148         return self.url_result(canonical_url, 'Rutube')
 149
 150
 151 class RutubeChannelIE(InfoExtractor):
 152     IE_NAME = 'rutube:channel'
 153     IE_DESC = 'Rutube channels'
 154     _VALID_URL = r'https?://rutube\.ru/tags/video/(?P<id>\d+)'
 155     _TESTS = [{
 156         'url': 'http://rutube.ru/tags/video/1800/',
 157         'info_dict': {
 158             'id': '1800',
 159         },
 160         'playlist_mincount': 68,
 161     }]
 162
 163     _PAGE_TEMPLATE = 'http://rutube.ru/api/tags/video/%s/?page=%s&format=json'
 164
 165     def _extract_videos(self, channel_id, channel_title=None):
 166         entries = []
 167         for pagenum in itertools.count(1):
 168             page = self._download_json(
 169                 self._PAGE_TEMPLATE % (channel_id, pagenum),
 170                 channel_id, 'Downloading page %s' % pagenum)
 171             results = page['results']
 172             if not results:
 173                 break
 174             entries.extend(self.url_result(result['video_url'], 'Rutube') for result in results)
 175             if not page['has_next']:
 176                 break
 177         return self.playlist_result(entries, channel_id, channel_title)
 178
 179     def _real_extract(self, url):
 180         mobj = re.match(self._VALID_URL, url)
 181         channel_id = mobj.group('id')
 182         return self._extract_videos(channel_id)
 183
 184
 185 class RutubeMovieIE(RutubeChannelIE):
 186     IE_NAME = 'rutube:movie'
 187     IE_DESC = 'Rutube movies'
 188     _VALID_URL = r'https?://rutube\.ru/metainfo/tv/(?P<id>\d+)'
 189     _TESTS = []
 190
 191     _MOVIE_TEMPLATE = 'http://rutube.ru/api/metainfo/tv/%s/?format=json'
 192     _PAGE_TEMPLATE = 'http://rutube.ru/api/metainfo/tv/%s/video?page=%s&format=json'
 193
 194     def _real_extract(self, url):
 195         movie_id = self._match_id(url)
 196         movie = self._download_json(
 197             self._MOVIE_TEMPLATE % movie_id, movie_id,
 198             'Downloading movie JSON')
 199         movie_name = movie['name']
 200         return self._extract_videos(movie_id, movie_name)
 201
 202
 203 class RutubePersonIE(RutubeChannelIE):
 204     IE_NAME = 'rutube:person'
 205     IE_DESC = 'Rutube person videos'
 206     _VALID_URL = r'https?://rutube\.ru/video/person/(?P<id>\d+)'
 207     _TESTS = [{
 208         'url': 'http://rutube.ru/video/person/313878/',
 209         'info_dict': {
 210             'id': '313878',
 211         },
 212         'playlist_mincount': 37,
 213     }]
 214
 215     _PAGE_TEMPLATE = 'http://rutube.ru/api/video/person/%s/?page=%s&format=json'
 216
 217
 218 class RutubePlaylistIE(InfoExtractor):
 219     IE_NAME = 'rutube:playlist'
 220     IE_DESC = 'Rutube playlists'
 221     _TESTS = [{
 222         'url': 'https://rutube.ru/video/10b3a03fc01d5bbcc632a2f3514e8aab/?pl_id=4252&pl_type=source',
 223         'info_dict': {
 224             'id': '4252',
 225         },
 226         'playlist_count': 25,
 227     }]
 228
 229     _VALID_URL = r'https?://rutube\.ru/(?:video|(?:play/)?embed)/[\da-z]{32}/\?(?:.+)?pl_id=(?P<id>\d+)'
 230     _PAGE_TEMPLATE = 'http://rutube.ru/api/playlist/source/%s/?page=%s'
 231
 232     @staticmethod
 233     def suitable(url):
 234         params = compat_parse_qs(compat_urllib_parse_urlparse(url).query)
 235         return params.get('pl_id') and int_or_none(params['pl_id'][0]) \
 236             and params.get('pl_type')
 237
 238     def _real_extract(self, url):
 239         playlist_id = self._match_id(url)
 240         return self._extract_playlist(playlist_id)
 241
 242     def _extract_playlist(self, playlist_id):
 243         entries = []
 244         for pagenum in itertools.count(1):
 245             page_url = self._PAGE_TEMPLATE % (playlist_id, pagenum)
 246
 247             # download_json will sent an accept: application/xml header
 248             page = self._download_json(page_url, playlist_id,
 249                                        "Downloading metadata for page %s" % pagenum,
 250                                        headers={'Accept': 'application/json'})
 251
 252             if not page['results']:
 253                 break
 254
 255             results = page['results']
 256             for result in results:
 257                 entry = self.url_result(result.get('video_url'), 'Rutube')
 258                 category = try_get(result, lambda x: x['category']['name'])
 259                 entry.update({
 260                     'id': result.get('id'),
 261                     'uploader': try_get(result, lambda x: x['author']['name']),
 262                     'uploader_id': try_get(result, lambda x: x['author']['id']),
 263                     'upload_date': unified_strdate(result.get('created_ts')),
 264                     'title': result.get('title'),
 265                     'description': result.get('description'),
 266                     'thumbnail': result.get('thumbnail_url'),
 267                     'duration': int_or_none(result.get('duration')),
 268                     'category': [category] if category else None,
 269                     'age_limit': 18 if result.get('is_adult') else 0,
 270                     'view_count': int_or_none(result.get('hits')),
 271                     'is_live': result.get('is_livestream'),
 272                     'webpage_url': result.get('video_url'),
 273                 })
 274                 entries.append(entry)
 275
 276             if page['has_next'] is False:
 277                 break
 278
 279         return self.playlist_result(entries, playlist_id, page['name'])