[youtube] Fix extraction.
[youtube-dl] / youtube_dl / extractor / libraryofcongress.py
1 # coding: utf-8
2 from __future__ import unicode_literals
3
4 import re
5
6 from .common import InfoExtractor
7
8 from ..utils import (
9     determine_ext,
10     float_or_none,
11     int_or_none,
12     parse_filesize,
13 )
14
15
class LibraryOfCongressIE(InfoExtractor):
    """Extractor for media pages on loc.gov (Library of Congress).

    Matches ``loc.gov/item/<id>`` pages and
    ``loc.gov/today/cyberlc/feature_wdesc.php?...rec=<id>`` webcast pages.
    """

    IE_NAME = 'loc'
    IE_DESC = 'Library of Congress'
    _VALID_URL = r'https?://(?:www\.)?loc\.gov/(?:item/|today/cyberlc/feature_wdesc\.php\?.*\brec=)(?P<id>[0-9a-z_.]+)'
    _TESTS = [{
        # embedded via <div class="media-player"
        'url': 'http://loc.gov/item/90716351/',
        'md5': '6ec0ae8f07f86731b1b2ff70f046210a',
        'info_dict': {
            'id': '90716351',
            'ext': 'mp4',
            'title': "Pa's trip to Mars",
            'duration': 0,
            'view_count': int,
        },
    }, {
        # webcast embedded via mediaObjectId
        'url': 'https://www.loc.gov/today/cyberlc/feature_wdesc.php?rec=5578',
        'info_dict': {
            'id': '5578',
            'ext': 'mp4',
            'title': 'Help! Preservation Training Needs Here, There & Everywhere',
            'duration': 3765,
            'view_count': int,
            'subtitles': 'mincount:1',
        },
        'params': {
            'skip_download': True,
        },
    }, {
        # with direct download links
        'url': 'https://www.loc.gov/item/78710669/',
        'info_dict': {
            'id': '78710669',
            'ext': 'mp4',
            'title': 'La vie et la passion de Jesus-Christ',
            'duration': 0,
            'view_count': int,
            'formats': 'mincount:4',
        },
        'params': {
            'skip_download': True,
        },
    }, {
        'url': 'https://www.loc.gov/item/ihas.200197114/',
        'only_matching': True,
    }, {
        'url': 'https://www.loc.gov/item/afc1981005_afs20503/',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Build the info dict for a loc.gov item or webcast page.

        The media id is scraped from the page markup, the media.loc.gov
        service is queried for the corresponding media object, and the
        format list is assembled from the object's first derivative plus
        any direct download links offered in the page's ``<option>``
        elements.

        Returns a dict with ``id``, ``title``, ``thumbnail``, ``duration``,
        ``view_count``, ``formats`` and ``subtitles``.
        """
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)

        # The media id can appear in several different places in the markup;
        # the patterns are tried in order.
        media_id = self._search_regex(
            (r'id=(["\'])media-player-(?P<id>.+?)\1',
             r'<video[^>]+id=(["\'])uuid-(?P<id>.+?)\1',
             r'<video[^>]+data-uuid=(["\'])(?P<id>.+?)\1',
             r'mediaObjectId\s*:\s*(["\'])(?P<id>.+?)\1',
             r'data-tab="share-media-(?P<id>[0-9A-F]{32})"'),
            webpage, 'media id', group='id')

        data = self._download_json(
            'https://media.loc.gov/services/v1/media?id=%s&context=json' % media_id,
            media_id)['mediaObject']

        # Only the first derivative is used.
        derivative = data['derivatives'][0]
        media_url = derivative['derivativeUrl']

        title = derivative.get('shortName') or data.get('shortName') or self._og_search_title(
            webpage)

        # Following algorithm was extracted from setAVSource js function
        # found in webpage
        media_url = media_url.replace('rtmp', 'https')

        # A missing *or null* mediaType is treated as video; `or 'v'` guards
        # against calling .lower() on None when the key is present but null.
        is_video = (data.get('mediaType') or 'v').lower() == 'v'
        ext = determine_ext(media_url)
        if ext not in ('mp4', 'mp3'):
            media_url += '.mp4' if is_video else '.mp3'

        formats = []
        if '/vod/mp4:' in media_url:
            formats.append({
                'url': media_url.replace('/vod/mp4:', '/hls-vod/media/') + '.m3u8',
                'format_id': 'hls',
                'ext': 'mp4',
                'protocol': 'm3u8_native',
                # Ranked below the direct HTTP format (quality 1); with equal
                # values the quality hint could not distinguish the two.
                'quality': -1,
            })
        http_format = {
            # Drop every path segment up to and including the "mp4:"/"mp3:"
            # marker, keeping the host and the remainder of the path.
            'url': re.sub(r'(://[^/]+/)(?:[^/]+/)*(?:mp4|mp3):', r'\1', media_url),
            'format_id': 'http',
            'quality': 1,
        }
        if not is_video:
            http_format['vcodec'] = 'none'
        formats.append(http_format)

        # Direct download links: <option> elements that carry a value and a
        # data-file-download attribute. The option label is used as format id;
        # an optional parenthesised suffix is parsed as the file size.
        download_urls = set()
        for m in re.finditer(
                r'<option[^>]+value=(["\'])(?P<url>.+?)\1[^>]+data-file-download=[^>]+>\s*(?P<id>.+?)(?:(?:&nbsp;|\s+)\((?P<size>.+?)\))?\s*<', webpage):
            format_id = m.group('id').lower()
            # Presumably still-image downloads rather than media; skipped.
            if format_id in ('gif', 'jpeg'):
                continue
            download_url = m.group('url')
            # The same URL may be offered more than once; keep the first.
            if download_url in download_urls:
                continue
            download_urls.add(download_url)
            formats.append({
                'url': download_url,
                'format_id': format_id,
                'filesize_approx': parse_filesize(m.group('size')),
            })

        self._sort_formats(formats)

        duration = float_or_none(data.get('duration'))
        view_count = int_or_none(data.get('viewCount'))

        subtitles = {}
        cc_url = data.get('ccUrl')
        if cc_url:
            subtitles.setdefault('en', []).append({
                'url': cc_url,
                'ext': 'ttml',
            })

        return {
            'id': video_id,
            'title': title,
            'thumbnail': self._og_search_thumbnail(webpage, default=None),
            'duration': duration,
            'view_count': view_count,
            'formats': formats,
            'subtitles': subtitles,
        }