[tiktok] Add extractor (closes #18108)
[youtube-dl] / youtube_dl / extractor / tiktok.py
1 # coding: utf-8
2 from __future__ import unicode_literals
3
4 from .common import InfoExtractor
5 from ..utils import (
6     compat_str,
7     int_or_none,
8     str_or_none,
9     try_get,
10     url_or_none,
11 )
12
13
class TikTokIE(InfoExtractor):
    """Extractor for TikTok video pages of the form ``tiktok.com/v/<id>``.

    The page is expected to embed a ``var data = {...};`` JavaScript
    assignment; metadata and candidate media URLs are read from that object.
    """

    _VALID_URL = r'https?://(?:m\.)?tiktok\.com/v/(?P<id>[0-9]+)'
    _TEST = {
        'url': 'https://m.tiktok.com/v/6606727368545406213.html',
        'md5': 'd584b572e92fcd48888051f238022420',
        'info_dict': {
            'id': '6606727368545406213',
            'ext': 'mp4',
            'title': 'Zureeal on TikTok',
            'thumbnail': r're:^https?://.*~noop.image',
            'description': '#bowsette#mario#cosplay#uk#lgbt#gaming#asian#bowsettecosplay',
            'uploader': 'Zureeal',
            'width': 540,
            'height': 960,
        }
    }

    def _real_extract(self, url):
        """Build the info dict for the video page at *url*.

        Returns a dict with ``id``, ``title``, ``description``, ``uploader``,
        ``formats``, ``thumbnail``, ``width`` and ``height``. Every field
        read from the embedded ``data`` object is optional: a missing or
        malformed entry yields ``None`` (or contributes no formats) instead
        of raising.
        """
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)

        data = self._parse_json(
            self._search_regex(
                r'var\s+data\s*=\s*({.+?});', webpage, 'data'),
            video_id)

        title = self._og_search_title(webpage)

        description = str_or_none(try_get(data, lambda x: x['desc']))
        width = int_or_none(try_get(data, lambda x: x['video']['width']))
        height = int_or_none(try_get(data, lambda x: x['video']['height']))

        # (key under data['video'], human-readable label), listed from least
        # to most preferred; enumerate() below maps them to quality -2, -1, 0.
        # NOTE(review): presumably _sort_formats ranks higher 'quality' values
        # as better - confirm against InfoExtractor._sort_formats.
        variants = (
            ('play_addr_lowbr', 'Low'),
            ('play_addr', 'Normal'),
            ('download_addr', 'Download'),
        )

        formats = []
        for quality, (key, label) in enumerate(variants, -2):
            # try_get() yields None when the variant or its url_list is
            # absent (or is not a list); fall back to an empty list so one
            # missing variant does not abort extraction with a TypeError.
            url_list = try_get(
                data, lambda x: x['video'][key]['url_list'], list) or []
            for candidate in url_list:
                format_url = url_or_none(candidate)
                if not format_url:
                    continue
                formats.append({
                    'url': format_url,
                    'ext': 'mp4',
                    'height': height,
                    'width': width,
                    'format_note': label,
                    'quality': quality,
                })

        self._sort_formats(formats)

        uploader = try_get(data, lambda x: x['author']['nickname'], compat_str)

        thumbnail = url_or_none(
            try_get(
                data, lambda x: x['video']['cover']['url_list'][0], compat_str))

        return {
            'id': video_id,
            'title': title,
            'description': description,
            'uploader': uploader,
            'formats': formats,
            'thumbnail': thumbnail,
            'width': width,
            'height': height,
        }