7 import xml.etree.ElementTree
12 compat_urllib_parse_urlparse,
# Unique sentinel meaning "no default was supplied"; lets callers of
# _search_regex/_html_search_regex pass None as a legitimate default value.
_NO_DEFAULT = object()
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    title:          Video title, unescaped.

    Additionally, it must contain either a formats entry or a url one:

    formats:        A list of dictionaries for each format available, ordered
                    from worst to best quality.

                    * url        Mandatory. The URL of the video file
                    * ext        Will be calculated from url if missing
                    * format     A human-readable description of the format
                                 ("mp4 container with h264/opus").
                                 Calculated from the format_id, width, height,
                                 and format_note fields if missing.
                    * format_id  A short description of the format
                                 ("mp4_h264_opus" or "19").
                                 Technically optional, but strongly recommended.
                    * format_note Additional info about the format
                                 ("3D" or "DASH video")
                    * width      Width of the video, if known
                    * height     Height of the video, if known
                    * resolution Textual description of width and height
                    * tbr        Average bitrate of audio and video in KBit/s
                    * abr        Average audio bitrate in KBit/s
                    * acodec     Name of the audio codec in use
                    * vbr        Average video bitrate in KBit/s
                    * vcodec     Name of the video codec in use
                    * filesize   The number of bytes, if known in advance
                    * player_url SWF Player URL (used for rtmpdump).
                    * protocol   The protocol that will be used for the actual
                                 download:
                                 "http", "https", "rtsp", "rtmp" or so.
                    * preference Order number of this format. If this field is
                                 present and not None, the formats get sorted
                                 by this field.
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
                    * quality    Order number of the video quality of this
                                 format, irrespective of the file format.
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.

    ext:            Video filename extension.
    format:         The video format, defaults to ext (used for --get-format)
    player_url:     SWF Player URL (used for rtmpdump).

    The following fields are optional:

    thumbnails:     A list of dictionaries (with the entries "resolution" and
                    "url") for the varying thumbnails
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    subtitles:      The subtitle file contents as a dictionary in the format
                    {language: subtitles}.
    duration:       Length of the video in seconds, as an integer.
    view_count:     How many users have watched the video on the platform.
    like_count:     Number of positive ratings of the video
    dislike_count:  Number of negative ratings of the video
    comment_count:  Number of comments on the video
    age_limit:      Age restriction for the video, as an integer (years)
    webpage_url:    The url to the video webpage, if given to youtube-dl it
                    should allow to get the same result again. (It will be set
                    by YoutubeDL if it's missing)

    Unless mentioned otherwise, the fields should be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """
    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        # NOTE(review): one line is elided from this excerpt before the call
        # (presumably resetting a "not yet initialized" flag) — confirm
        # against the full source.
        self.set_downloader(downloader)
130 def suitable(cls, url):
131 """Receives a URL and returns True if suitable for this IE."""
133 # This does not use has/getattr intentionally - we want to know whether
134 # we have cached the regexp for *this* class, whereas getattr would also
135 # match the superclass
136 if '_VALID_URL_RE' not in cls.__dict__:
137 cls._VALID_URL_RE = re.compile(cls._VALID_URL)
138 return cls._VALID_URL_RE.match(url) is not None
        """Getter method for _WORKING."""
        # NOTE(review): the enclosing def line (and its decorator, if any) for
        # the _WORKING getter is elided from this excerpt.

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        # NOTE(review): a guard line is elided here (initialization appears
        # intended to run only once) — confirm against the full source.
        self._real_initialize()
    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        # NOTE(review): a line is elided before the call (likely ensuring
        # initialize() has run) — confirm against the full source.
        return self._real_extract(url)
156 def set_downloader(self, downloader):
157 """Sets the downloader for this IE."""
158 self._downloader = downloader
    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        # NOTE(review): the (presumably no-op) body line is elided from this
        # excerpt.

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        # NOTE(review): the body line is elided from this excerpt.
        """A string for getting the InfoExtractor with get_info_extractor"""
        # NOTE(review): the def lines for this method and the IE_NAME
        # accessor below (and any decorators) are elided from this excerpt.
        # Both strip a trailing two-character suffix from the class name.
        return cls.__name__[:-2]

        return type(self).__name__[:-2]
    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
        """ Returns the response handle """
        # NOTE(review): several control-flow lines (the note-selection
        # if/else chain, the try statement, and the fatal/non-fatal branch)
        # are elided from this excerpt; the indentation below is
        # reconstructed — confirm against the full source.
        self.report_download_webpage(video_id)
        elif note is not False:
            # A note without a video id is printed bare; with one, prefixed.
            self.to_screen(u'%s' % (note,))
            self.to_screen(u'%s: %s' % (video_id, note))
            return self._downloader.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            # Default error note when the caller did not supply one.
            errnote = u'Unable to download webpage'
            errmsg = u'%s: %s' % (errnote, compat_str(err))
            # Fatal path: re-raise as ExtractorError carrying the traceback
            # and the original exception as cause...
            raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
            # ...non-fatal path: just warn.
            self._downloader.report_warning(errmsg)
    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
        """ Returns a tuple (page content as string, URL handle) """

        # Strip hashes from the URL (#1038)
        if isinstance(url_or_request, (compat_str, str)):
            url_or_request = url_or_request.partition('#')[0]

        urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
        # NOTE(review): a failure/early-return guard appears to be elided
        # here — confirm against the full source.
        content_type = urlh.headers.get('Content-Type', '')
        webpage_bytes = urlh.read()
        # Prefer the charset declared in the Content-Type header.
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
            encoding = m.group(1)
            # Fallback: probe the first 1 KiB of the body for a <meta charset>
            # declaration (the conditional lines around these are elided).
            m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
                          webpage_bytes[:1024])
            encoding = m.group(1).decode('ascii')
        # Debug aid: dump the raw page, base64-encoded, to the screen.
        if self._downloader.params.get('dump_intermediate_pages', False):
            # NOTE(review): the try statement and the AttributeError fallback
            # assignment are elided (plain-string URLs have no get_full_url).
            url = url_or_request.get_full_url()
            except AttributeError:
            self.to_screen(u'Dumping request to ' + url)
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        # Debug aid: save the raw page to a sanitized dump file on disk.
        if self._downloader.params.get('write_pages', False):
            url = url_or_request.get_full_url()
            except AttributeError:
            raw_filename = ('%s_%s.dump' % (video_id, url))
            filename = sanitize_filename(raw_filename, restricted=True)
            self.to_screen(u'Saving request to ' + filename)
            with open(filename, 'wb') as outf:
                outf.write(webpage_bytes)

        # 'replace' keeps extraction alive on mis-declared encodings.
        content = webpage_bytes.decode(encoding, 'replace')
        return (content, urlh)
    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
        """ Returns the data of the page as a string """
        res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal)
        # NOTE(review): the unpacking of res into (content, handle) and the
        # return statement are elided from this excerpt.
    def _download_xml(self, url_or_request, video_id,
                      note=u'Downloading XML', errnote=u'Unable to download XML',
                      transform_source=None):
        """Return the xml as an xml.etree.ElementTree.Element"""
        xml_string = self._download_webpage(url_or_request, video_id, note, errnote)
        # NOTE(review): a guard line is elided here — transform_source is
        # presumably applied only when provided; confirm upstream.
        xml_string = transform_source(xml_string)
        # Encode to UTF-8 bytes so the parser handles non-ASCII content.
        return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
263 def report_warning(self, msg, video_id=None):
264 idstr = u'' if video_id is None else u'%s: ' % video_id
265 self._downloader.report_warning(
266 u'[%s] %s%s' % (self.IE_NAME, idstr, msg))
268 def to_screen(self, msg):
269 """Print msg to screen, prefixing it with '[ie_name]'"""
270 self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))
272 def report_extraction(self, id_or_name):
273 """Report information extraction."""
274 self.to_screen(u'%s: Extracting information' % id_or_name)
276 def report_download_webpage(self, video_id):
277 """Report webpage download."""
278 self.to_screen(u'%s: Downloading webpage' % video_id)
280 def report_age_confirmation(self):
281 """Report attempt to confirm age."""
282 self.to_screen(u'Confirming age')
284 def report_login(self):
285 """Report attempt to log in."""
286 self.to_screen(u'Logging in')
    # Methods for producing url/playlist result dictionaries (see issue #608)
    def url_result(url, ie=None, video_id=None):
        """Returns a url that points to a page that should be processed"""
        #TODO: ie should be the class used for getting the info
        # NOTE(review): the 'url'/'ie_key' dict entries, the closing brace and
        # the return statement are elided from this excerpt.
        video_info = {'_type': 'url',
        if video_id is not None:
            video_info['id'] = video_id
    def playlist_result(entries, playlist_id=None, playlist_title=None):
        """Returns a playlist"""
        # NOTE(review): the 'entries' dict entry, the closing brace, the two
        # None-guards and the return statement are elided from this excerpt.
        video_info = {'_type': 'playlist',
            video_info['id'] = playlist_id
            video_info['title'] = playlist_title
    def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
        """
        Perform a regex search on the given string, using a single or a list of
        patterns returning the first matching group.
        In case of failure return a default value or raise a WARNING or a
        RegexNotFoundError, depending on fatal, specifying the field name.
        """
        if isinstance(pattern, (str, compat_str, compiled_regex_type)):
            mobj = re.search(pattern, string, flags)
            # NOTE(review): the else branch that iterates a list of patterns
            # (stopping at the first match) is partly elided here.
            mobj = re.search(p, string, flags)

        # Colorize the field name on terminals that support ANSI escapes.
        if os.name != 'nt' and sys.stderr.isatty():
            _name = u'\033[0;34m%s\033[0m' % name

        # NOTE(review): the mobj success-guard and the default-return line are
        # elided from this excerpt.
            # return the first matching group
            return next(g for g in mobj.groups() if g is not None)
        elif default is not _NO_DEFAULT:
            raise RegexNotFoundError(u'Unable to extract %s' % _name)
            self._downloader.report_warning(u'unable to extract %s; '
                u'please report this issue on http://yt-dl.org/bug' % _name)
    def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
        """
        Like _search_regex, but strips HTML tags and unescapes entities.
        """
        res = self._search_regex(pattern, string, name, default, fatal, flags)
        # NOTE(review): a None-guard (and its else return) is elided from
        # this excerpt.
            return clean_html(res).strip()
    def _get_login_info(self):
        """
        Get the login info as (username, password)
        It will look in the netrc file using the _NETRC_MACHINE value
        If there's no info available, return (None, None)
        """
        # NOTE(review): several lines are elided from this excerpt: the
        # no-downloader early return, the username/password defaults, the try
        # statement, and the netrc success branch — confirm upstream.
        if self._downloader is None:

        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                # A missing netrc entry is surfaced as a parse error so the
                # warning path below reports it.
                raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))

        return (username, password)
381 # Helper functions for extracting OpenGraph info
    def _og_regexes(prop):
        # Builds regexes matching <meta ... og:PROP ... content=...> tags;
        # content may be double- or single-quoted, and the og property can
        # appear in either a name= or property= attribute.
        content_re = r'content=(?:"([^>]+?)"|\'(.+?)\')'
        property_re = r'(?:name|property)=[\'"]og:%s[\'"]' % re.escape(prop)
        template = r'<meta[^>]+?%s[^>]+?%s'
        # NOTE(review): the surrounding return-list brackets are elided from
        # this excerpt; both attribute orders are generated.
        template % (property_re, content_re),
        template % (content_re, property_re),
    def _og_search_property(self, prop, html, name=None, **kargs):
        # NOTE(review): two guard lines are elided from this excerpt: the
        # name-is-None check before the default below, and an
        # escaped-is-None early return — confirm upstream.
        name = 'OpenGraph %s' % prop
        escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
        # The regex captures raw attribute text; unescape HTML entities.
        return unescapeHTML(escaped)
400 def _og_search_thumbnail(self, html, **kargs):
401 return self._og_search_property('image', html, u'thumbnail url', fatal=False, **kargs)
403 def _og_search_description(self, html, **kargs):
404 return self._og_search_property('description', html, fatal=False, **kargs)
406 def _og_search_title(self, html, **kargs):
407 return self._og_search_property('title', html, **kargs)
409 def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
410 regexes = self._og_regexes('video')
411 if secure: regexes = self._og_regexes('video:secure_url') + regexes
412 return self._html_search_regex(regexes, html, name, **kargs)
    def _html_search_meta(self, name, html, display_name=None):
        # Finds a <meta> tag whose itemprop/name/property attribute equals
        # `name` and returns its content attribute (non-fatal).
        if display_name is None:
            # NOTE(review): the display_name fallback assignment is elided
            # from this excerpt, as is the raw-string opening of the pattern
            # passed below — confirm upstream.
        return self._html_search_regex(
            (?=[^>]+(?:itemprop|name|property)=["\']%s["\'])
            [^>]+content=["\']([^"\']+)["\']''' % re.escape(name),
            html, display_name, fatal=False)
423 def _dc_search_uploader(self, html):
424 return self._html_search_meta('dc.creator', html, 'uploader')
    def _rta_search(self, html):
        # See http://www.rtalabel.org/index.php?content=howtofaq#single
        # NOTE(review): the search target argument and both return statements
        # (restricted vs. unrestricted age value) are elided from this
        # excerpt — confirm upstream.
        if re.search(r'(?ix)<meta\s+name="rating"\s+'
                     r' content="RTA-5042-1996-1400-1577-RTA"',
    def _media_rating_search(self, html):
        # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
        rating = self._html_search_meta('rating', html)
        # NOTE(review): the RATING_TABLE mapping definition (and any
        # rating-is-None guard) is elided from this excerpt — confirm
        # upstream before relying on the lookup below.
        return RATING_TABLE.get(rating.lower(), None)
    def _sort_formats(self, formats):
        # Sorts formats in place from worst to best using a composite key.
        # NOTE(review): the inner key-function def line and a number of
        # branch/assignment lines are elided from this excerpt; the
        # indentation below is reconstructed — confirm against the full
        # source.
            # TODO remove the following workaround
            from ..utils import determine_ext
            if not f.get('ext') and 'url' in f:
                f['ext'] = determine_ext(f['url'])

            preference = f.get('preference')
            if preference is None:
                proto = f.get('protocol')
                # Fall back to the URL scheme when no protocol is recorded.
                proto = compat_urllib_parse_urlparse(f.get('url', '')).scheme
                # Plain HTTP(S) downloads are slightly preferred.
                preference = 0 if proto in ['http', 'https'] else -0.1
                if f.get('ext') in ['f4f', 'f4m']: # Not yet supported

            if f.get('vcodec') == 'none':  # audio only
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = [u'aac', u'mp3', u'm4a', u'webm', u'ogg', u'opus']
                    # (else branch — non-free-preferring order — elided)
                    ORDER = [u'webm', u'opus', u'ogg', u'mp3', u'aac', u'm4a']
                    audio_ext_preference = ORDER.index(f['ext'])
                    # Unknown extensions rank below all listed ones.
                    audio_ext_preference = -1
                # Video branch: analogous container-extension preference.
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = [u'flv', u'mp4', u'webm']
                    ORDER = [u'webm', u'flv', u'mp4']
                    ext_preference = ORDER.index(f['ext'])
                audio_ext_preference = 0

            # Components of the composite sort key (the surrounding return
            # tuple lines are elided from this excerpt):
                f.get('quality') if f.get('quality') is not None else -1,
                f.get('height') if f.get('height') is not None else -1,
                f.get('width') if f.get('width') is not None else -1,
                f.get('vbr') if f.get('vbr') is not None else -1,
                f.get('abr') if f.get('abr') is not None else -1,
                audio_ext_preference,
                f.get('filesize') if f.get('filesize') is not None else -1,

        formats.sort(key=_formats_key)
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """
511 def _make_valid_url(cls):
512 return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
515 def suitable(cls, url):
516 return re.match(cls._make_valid_url(), url) is not None
    def _real_extract(self, query):
        # Parses the "<key><prefix>:<query>" search URL and dispatches to
        # _get_n_results with the requested result count.
        mobj = re.match(self._make_valid_url(), query)
        # NOTE(review): the mobj-is-None guard line is elided here.
            raise ExtractorError(u'Invalid search query "%s"' % query)

        prefix = mobj.group('prefix')
        query = mobj.group('query')
        # Empty prefix => a single result; 'all' => up to _MAX_RESULTS;
        # numeric => that many results.
        # NOTE(review): the empty-prefix check, the numeric-prefix parse
        # (n = int(prefix)) and the n <= 0 guard are elided from this
        # excerpt — confirm upstream.
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)
            raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
        elif n > self._MAX_RESULTS:
            # Clamp over-large requests to the extractor's maximum.
            self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
            n = self._MAX_RESULTS
        return self._get_n_results(query, n)
538 def _get_n_results(self, query, n):
539 """Get a specified number of results for a query"""
540 raise NotImplementedError("This method must be implemented by subclasses")
543 def SEARCH_KEY(self):
544 return self._SEARCH_KEY