[viki:channel] Extract title from JSON
[youtube-dl] / youtube_dl / extractor / vulture.py
1 from __future__ import unicode_literals
2
3 import json
4 import os.path
5 import re
6
7 from .common import InfoExtractor
8 from ..utils import (
9     int_or_none,
10     parse_iso8601,
11 )
12
13
class VultureIE(InfoExtractor):
    """Extractor for videos served from video.vulture.com.

    The video page embeds a JavaScript ``queryString`` assignment. That query
    string is replayed against the embed-player container URL, whose response
    holds a ``contentItem`` JSON object with the media metadata.
    """

    IE_NAME = 'vulture.com'
    _VALID_URL = r'https?://video\.vulture\.com/video/(?P<display_id>[^/]+)/'
    _TEST = {
        'url': 'http://video.vulture.com/video/Mindy-Kaling-s-Harvard-Speech/player?layout=compact&read_more=1',
        'md5': '8d997845642a2b5152820f7257871bc8',
        'info_dict': {
            'id': '6GHRQL3RV7MSD1H4',
            'ext': 'mp4',
            'title': 'kaling-speech-2-MAGNIFY STANDARD CONTAINER REVISED',
            'uploader_id': 'Sarah',
            # Raw string: '\.' is not a valid escape in a normal literal.
            'thumbnail': r're:^http://.*\.jpg$',
            'timestamp': 1401288564,
            'upload_date': '20140528',
            'description': 'Uplifting and witty, as predicted.',
            'duration': 1015,
        }
    }

    def _real_extract(self, url):
        """Build the info dict for a single video page.

        Args:
            url: A URL matching ``_VALID_URL``.

        Returns:
            A dict with ``id``, ``display_id``, ``url`` and ``title``, plus
            the optional ``timestamp``, ``thumbnail``, ``uploader_id``,
            ``description`` and ``duration`` (``None`` when the site does not
            provide them).

        Raises:
            KeyError: If the player JSON lacks ``media_item`` or its
                ``title`` / ``pipeline_xid`` entries.
            ValueError: If the extracted player parameters are not valid JSON.
        """
        mobj = re.match(self._VALID_URL, url)
        display_id = mobj.group('display_id')

        # Step 1: the landing page only carries the query string that
        # identifies the video to the embeddable player.
        webpage = self._download_webpage(url, display_id)
        query_string = self._search_regex(
            r"queryString\s*=\s*'([^']+)'", webpage, 'query string')
        video_id = self._search_regex(
            r'content=([^&]+)', query_string, 'video ID')
        query_url = 'http://video.vulture.com/embed/player/container/1000/1000/?%s' % query_string

        # Step 2: the container page holds the actual metadata as a JSON
        # object passed to the MagnifyEmbeddablePlayer constructor.
        query_webpage = self._download_webpage(
            query_url, display_id, note='Downloading query page')
        params_json = self._search_regex(
            r'(?sm)new MagnifyEmbeddablePlayer\({.*?contentItem:\s*(\{.*?\})\n,\n',
            query_webpage,
            'player params')
        params = json.loads(params_json)

        # Optional metadata: a missing or null field must not abort an
        # otherwise successful extraction.
        posted = params.get('posted')
        # The site separates date and time with a space; swap in the 'T'
        # separator before handing the value to parse_iso8601.
        upload_timestamp = (
            parse_iso8601(posted.replace(' ', 'T')) if posted else None)
        uploader_id = (params.get('user') or {}).get('handle')

        # Mandatory metadata: fail loudly if it is absent.
        media_item = params['media_item']
        # The reported title is a file name; drop its extension.
        title = os.path.splitext(media_item['title'])[0]
        duration = int_or_none(media_item.get('duration_seconds'))

        return {
            'id': video_id,
            'display_id': display_id,
            'url': media_item['pipeline_xid'],
            'title': title,
            'timestamp': upload_timestamp,
            'thumbnail': params.get('thumbnail_url'),
            'uploader_id': uploader_id,
            'description': params.get('description'),
            'duration': duration,
        }