11 compat_urllib_request,
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    id:             Video identifier.
    url:            Final video URL.
    title:          Video title, unescaped.
    ext:            Video filename extension.

    Instead of url and ext, formats can also specified.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnails:     A list of dictionaries (with the entries "resolution" and
                    "url") for the varying thumbnails
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The subtitle file contents as a dictionary in the format
                    {language: subtitles}.
    view_count:     How many users have watched the video on the platform.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen
    age_limit:      Age restriction for the video, as an integer (years)
    formats:        A list of dictionaries for each format available, it must
                    be ordered from worst to best quality. Potential fields:
                    * url       Mandatory. The URL of the video file
                    * ext       Will be calculated from url if missing
                    * format    A human-readable description of the format
                                ("mp4 container with h264/opus").
                                Calculated from the format_id, width, height
                                and format_note fields if missing.
                    * format_id A short description of the format
                                ("mp4_h264_opus" or "19")
                    * format_note Additional info about the format
                                ("3D" or "DASH video")
                    * width     Width of the video, if known
                    * height    Height of the video, if known

    Unless mentioned otherwise, the fields should be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    # Lazy initialization flag; set to True by initialize().
    _ready = False
    # The FileDownloader instance (set via set_downloader()).
    _downloader = None
    # Whether the extractor is known to work; broken IEs set this to False.
    _WORKING = True
def __init__(self, downloader=None):
    """Constructor. Receives an optional downloader."""
    # Instance starts uninitialized; initialize() flips this to True.
    self._ready = False
    self.set_downloader(downloader)
def suitable(cls, url):
    """Receives a URL and returns True if suitable for this IE."""

    # This does not use has/getattr intentionally - we want to know whether
    # we have cached the regexp for *this* class, whereas getattr would also
    # match the superclass
    if '_VALID_URL_RE' not in cls.__dict__:
        cls._VALID_URL_RE = re.compile(cls._VALID_URL)
    return cls._VALID_URL_RE.match(url) is not None
@classmethod
def working(cls):
    """Getter method for _WORKING."""
    return cls._WORKING
def initialize(self):
    """Initializes an instance (authentication, etc)."""
    # Run the real initialization only once per instance.
    if not self._ready:
        self._real_initialize()
        self._ready = True
def extract(self, url):
    """Extracts URL information and returns it in list of dicts."""
    # Ensure lazy initialization (login, age confirmation, ...) ran first.
    self.initialize()
    return self._real_extract(url)
def set_downloader(self, downloader):
    """Sets the downloader for this IE."""
    self._downloader = downloader
127 def _real_initialize(self):
128 """Real initialization process. Redefine in subclasses."""
131 def _real_extract(self, url):
132 """Real extraction process. Redefine in subclasses."""
@classmethod
def ie_key(cls):
    """A string for getting the InfoExtractor with get_info_extractor"""
    # Convention: class names end in "IE"; strip that suffix.
    return cls.__name__[:-2]
@property
def IE_NAME(self):
    # Human-readable extractor name, derived from the class name
    # (convention: class names end in "IE"; strip that suffix).
    return type(self).__name__[:-2]
def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
    """ Returns the response handle """
    # note=None -> default progress message; note=False -> silent.
    if note is None:
        self.report_download_webpage(video_id)
    elif note is not False:
        self.to_screen(u'%s: %s' % (video_id, note))
    try:
        return compat_urllib_request.urlopen(url_or_request)
    except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        if errnote is None:
            errnote = u'Unable to download webpage'
        # Re-raise as ExtractorError, preserving the traceback and cause.
        raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2], cause=err)
def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None):
    """ Returns a tuple (page content as string, URL handle) """

    # Strip hashes from the URL (#1038)
    if isinstance(url_or_request, (compat_str, str)):
        url_or_request = url_or_request.partition('#')[0]

    urlh = self._request_webpage(url_or_request, video_id, note, errnote)
    content_type = urlh.headers.get('Content-Type', '')
    webpage_bytes = urlh.read()
    # Prefer the charset declared in the Content-Type header ...
    m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
    if m:
        encoding = m.group(1)
    else:
        # ... otherwise look for a <meta charset=...> in the first KiB ...
        m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
                      webpage_bytes[:1024])
        if m:
            encoding = m.group(1).decode('ascii')
        else:
            # ... and fall back to UTF-8.
            encoding = 'utf-8'
    if self._downloader.params.get('dump_intermediate_pages', False):
        try:
            url = url_or_request.get_full_url()
        except AttributeError:
            # url_or_request is already a plain URL string.
            url = url_or_request
        self.to_screen(u'Dumping request to ' + url)
        dump = base64.b64encode(webpage_bytes).decode('ascii')
        self._downloader.to_screen(dump)
    content = webpage_bytes.decode(encoding, 'replace')
    return (content, urlh)
188 def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
189 """ Returns the data of the page as a string """
190 return self._download_webpage_handle(url_or_request, video_id, note, errnote)[0]
def to_screen(self, msg):
    """Print msg to screen, prefixing it with '[ie_name]'"""
    self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))
def report_extraction(self, id_or_name):
    """Report information extraction."""
    self.to_screen(u'%s: Extracting information' % id_or_name)
def report_download_webpage(self, video_id):
    """Report webpage download."""
    self.to_screen(u'%s: Downloading webpage' % video_id)
def report_age_confirmation(self):
    """Report attempt to confirm age."""
    self.to_screen(u'Confirming age')
def report_login(self):
    """Report attempt to log in."""
    self.to_screen(u'Logging in')
# Methods for following #608
def url_result(self, url, ie=None):
    """Returns a url that points to a page that should be processed"""
    # TODO: ie should be the class used for getting the info
    video_info = {'_type': 'url',
                  'url': url,
                  'ie_key': ie}
    return video_info
def playlist_result(self, entries, playlist_id=None, playlist_title=None):
    """Returns a playlist"""
    video_info = {'_type': 'playlist',
                  'entries': entries}
    # id/title are optional; only include them when provided.
    if playlist_id:
        video_info['id'] = playlist_id
    if playlist_title:
        video_info['title'] = playlist_title
    return video_info
def _search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
    """
    Perform a regex search on the given string, using a single or a list of
    patterns returning the first matching group.
    In case of failure return a default value or raise a WARNING or a
    RegexNotFoundError, depending on fatal, specifying the field name.
    """
    if isinstance(pattern, (str, compat_str, compiled_regex_type)):
        mobj = re.search(pattern, string, flags)
    else:
        # A list of patterns: use the first one that matches.
        for p in pattern:
            mobj = re.search(p, string, flags)
            if mobj:
                break

    # Colorize the field name on capable terminals (not Windows).
    if sys.stderr.isatty() and os.name != 'nt':
        _name = u'\033[0;34m%s\033[0m' % name
    else:
        _name = name

    if mobj:
        # return the first matching group
        return next(g for g in mobj.groups() if g is not None)
    elif default is not None:
        return default
    elif fatal:
        raise RegexNotFoundError(u'Unable to extract %s' % _name)
    else:
        self._downloader.report_warning(u'unable to extract %s; '
            u'please report this issue on http://yt-dl.org/bug' % _name)
        return None
def _html_search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
    """
    Like _search_regex, but strips HTML tags and unescapes entities.
    """
    res = self._search_regex(pattern, string, name, default, fatal, flags)
    if res:
        return clean_html(res).strip()
    else:
        return None
271 def _get_login_info(self):
273 Get the the login info as (username, password)
274 It will look in the netrc file using the _NETRC_MACHINE value
275 If there's no info available, return (None, None)
277 if self._downloader is None:
282 downloader_params = self._downloader.params
284 # Attempt to use provided username and password or .netrc data
285 if downloader_params.get('username', None) is not None:
286 username = downloader_params['username']
287 password = downloader_params['password']
288 elif downloader_params.get('usenetrc', False):
290 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
295 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
296 except (IOError, netrc.NetrcParseError) as err:
297 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
299 return (username, password)
# Helper functions for extracting OpenGraph info
304 return r'<meta.+?property=[\'"]og:%s[\'"].+?content=(?:"(.+?)"|\'(.+?)\')' % re.escape(prop)
def _og_search_property(self, prop, html, name=None, **kargs):
    # Search html for the og:<prop> meta tag; extra kwargs (default,
    # fatal, ...) are forwarded to _search_regex.
    if name is None:
        name = 'OpenGraph %s' % prop
    escaped = self._search_regex(self._og_regex(prop), html, name, flags=re.DOTALL, **kargs)
    return unescapeHTML(escaped)
312 def _og_search_thumbnail(self, html, **kargs):
313 return self._og_search_property('image', html, u'thumbnail url', fatal=False, **kargs)
315 def _og_search_description(self, html, **kargs):
316 return self._og_search_property('description', html, fatal=False, **kargs)
318 def _og_search_title(self, html, **kargs):
319 return self._og_search_property('title', html, **kargs)
321 def _og_search_video_url(self, html, name='video url', **kargs):
322 return self._html_search_regex([self._og_regex('video:secure_url'),
323 self._og_regex('video')],
326 def _rta_search(self, html):
327 # See http://www.rtalabel.org/index.php?content=howtofaq#single
328 if re.search(r'(?ix)<meta\s+name="rating"\s+'
329 r' content="RTA-5042-1996-1400-1577-RTA"',
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """
343 def _make_valid_url(cls):
344 return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
def suitable(cls, url):
    # A search "URL" is suitable iff it matches the generated search pattern.
    return re.match(cls._make_valid_url(), url) is not None
350 def _real_extract(self, query):
351 mobj = re.match(self._make_valid_url(), query)
353 raise ExtractorError(u'Invalid search query "%s"' % query)
355 prefix = mobj.group('prefix')
356 query = mobj.group('query')
358 return self._get_n_results(query, 1)
359 elif prefix == 'all':
360 return self._get_n_results(query, self._MAX_RESULTS)
364 raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
365 elif n > self._MAX_RESULTS:
366 self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
367 n = self._MAX_RESULTS
368 return self._get_n_results(query, n)
370 def _get_n_results(self, query, n):
371 """Get a specified number of results for a query"""
372 raise NotImplementedError("This method must be implemented by subclasses")
def SEARCH_KEY(self):
    # Public accessor for the subclass-defined _SEARCH_KEY.
    return self._SEARCH_KEY