[loc] relax _VALID_URL regex and improve formats extraction
[youtube-dl] / youtube_dl / extractor / libraryofcongress.py
1 # coding: utf-8
2 from __future__ import unicode_literals
3
4 import re
5
6 from .common import InfoExtractor
7
8 from ..utils import (
9     determine_ext,
10     float_or_none,
11     int_or_none,
12     parse_filesize,
13 )
14
15
class LibraryOfCongressIE(InfoExtractor):
    """Extractor for loc.gov item pages and cyberlc webcast pages.

    A media id is located in the page markup and resolved through the
    media.loc.gov JSON service; download links listed in the page's
    ``<option>`` elements are added as additional formats.
    """

    IE_NAME = 'loc'
    IE_DESC = 'Library of Congress'
    _VALID_URL = r'https?://(?:www\.)?loc\.gov/(?:item/|today/cyberlc/feature_wdesc\.php\?.*\brec=)(?P<id>[0-9a-z_.]+)'
    _TESTS = [{
        # embedded via <div class="media-player"
        'url': 'http://loc.gov/item/90716351/',
        'md5': '353917ff7f0255aa6d4b80a034833de8',
        'info_dict': {
            'id': '90716351',
            'ext': 'mp4',
            'title': "Pa's trip to Mars",
            'thumbnail': r're:^https?://.*\.jpg$',
            'duration': 0,
            'view_count': int,
        },
    }, {
        # webcast embedded via mediaObjectId
        'url': 'https://www.loc.gov/today/cyberlc/feature_wdesc.php?rec=5578',
        'info_dict': {
            'id': '5578',
            'ext': 'mp4',
            'title': 'Help! Preservation Training Needs Here, There & Everywhere',
            'duration': 3765,
            'view_count': int,
            'subtitles': 'mincount:1',
        },
        'params': {
            'skip_download': True,
        },
    }, {
        # with direct download links
        'url': 'https://www.loc.gov/item/78710669/',
        'info_dict': {
            'id': '78710669',
            'ext': 'mp4',
            'title': 'La vie et la passion de Jesus-Christ',
            'duration': 0,
            'view_count': int,
            'formats': 'mincount:4',
        },
        'params': {
            'skip_download': True,
        },
    }, {
        'url': 'https://www.loc.gov/item/ihas.200197114/',
        'only_matching': True,
    }, {
        'url': 'https://www.loc.gov/item/afc1981005_afs20503/',
        'only_matching': True,
    }]

    # Page markup variants handed to _search_regex to find the media id.
    _MEDIA_ID_PATTERNS = (
        r'id=(["\'])media-player-(?P<id>.+?)\1',
        r'<video[^>]+id=(["\'])uuid-(?P<id>.+?)\1',
        r'<video[^>]+data-uuid=(["\'])(?P<id>.+?)\1',
        r'mediaObjectId\s*:\s*(["\'])(?P<id>.+?)\1',
        r'data-tab="share-media-(?P<id>[0-9A-F]{32})"',
    )

    # Matches one <option ... data-file-download=...> entry: the link URL,
    # its label and an optional parenthesised size that follows the label.
    _DOWNLOAD_OPTION_RE = r'<option[^>]+value=(["\'])(?P<url>.+?)\1[^>]+data-file-download=[^>]+>\s*(?P<id>.+?)(?:(?:&nbsp;|\s+)\((?P<size>.+?)\))?\s*<'

    # Download option labels (lowercased) that are skipped.
    _SKIPPED_DOWNLOAD_IDS = ('gif', 'jpeg')

    @staticmethod
    def _build_streaming_formats(stream_url, is_video):
        """Return the format list derived from the derivative URL.

        An 'hls' entry is included only when the URL contains '/vod/mp4:';
        an 'http' entry is always included and is flagged as having no
        video codec when the media is not a video.
        """
        result = []

        if '/vod/mp4:' in stream_url:
            hls_url = stream_url.replace('/vod/mp4:', '/hls-vod/media/') + '.m3u8'
            result.append({
                'url': hls_url,
                'format_id': 'hls',
                'ext': 'mp4',
                'protocol': 'm3u8_native',
                'quality': 1,
            })

        # Drop every path component up to and including the 'mp4:'/'mp3:'
        # marker, keeping the scheme separator and host.
        direct_url = re.sub(
            r'(://[^/]+/)(?:[^/]+/)*(?:mp4|mp3):', r'\1', stream_url)
        direct = {
            'url': direct_url,
            'format_id': 'http',
            'quality': 1,
        }
        if not is_video:
            direct['vcodec'] = 'none'
        result.append(direct)

        return result

    def _iter_download_formats(self, webpage):
        """Yield one format dict per distinct download link in the page.

        Options whose lowercased label is listed in _SKIPPED_DOWNLOAD_IDS
        are ignored, and a URL seen earlier in the page is not repeated.
        """
        seen_urls = set()
        for option in re.finditer(self._DOWNLOAD_OPTION_RE, webpage):
            label = option.group('id').lower()
            if label in self._SKIPPED_DOWNLOAD_IDS:
                continue
            link = option.group('url')
            if link in seen_urls:
                continue
            seen_urls.add(link)
            yield {
                'url': link,
                'format_id': label,
                'filesize_approx': parse_filesize(option.group('size')),
            }

    def _real_extract(self, url):
        """Build the info dict for the loc.gov page at *url*."""
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)

        media_id = self._search_regex(
            self._MEDIA_ID_PATTERNS, webpage, 'media id', group='id')

        service_url = 'https://media.loc.gov/services/v1/media?id=%s&context=json' % media_id
        media_object = self._download_json(service_url, media_id)['mediaObject']

        # Only the first derivative is used.
        first_derivative = media_object['derivatives'][0]
        stream_url = first_derivative['derivativeUrl']

        # Prefer the derivative's short name, then the media object's, and
        # fall back to the page's og:title only when both are empty.
        title = first_derivative.get('shortName') or media_object.get('shortName')
        if not title:
            title = self._og_search_title(webpage)

        # The URL handling below follows the setAVSource js function found
        # in the webpage.
        stream_url = stream_url.replace('rtmp', 'https')

        is_video = media_object.get('mediaType', 'v').lower() == 'v'
        if determine_ext(stream_url) not in ('mp4', 'mp3'):
            if is_video:
                stream_url += '.mp4'
            else:
                stream_url += '.mp3'

        formats = self._build_streaming_formats(stream_url, is_video)
        formats.extend(self._iter_download_formats(webpage))
        self._sort_formats(formats)

        duration = float_or_none(media_object.get('duration'))
        view_count = int_or_none(media_object.get('viewCount'))

        caption_url = media_object.get('ccUrl')
        if caption_url:
            subtitles = {
                'en': [{
                    'url': caption_url,
                    'ext': 'ttml',
                }],
            }
        else:
            subtitles = {}

        return {
            'id': video_id,
            'title': title,
            'thumbnail': self._og_search_thumbnail(webpage, default=None),
            'duration': duration,
            'view_count': view_count,
            'formats': formats,
            'subtitles': subtitles,
        }