7 import xml.etree.ElementTree
12 compat_urllib_parse_urlparse,
22 _NO_DEFAULT = object()
class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    id:             Video identifier.
    title:          Video title, unescaped.

    Additionally, it must contain either a formats entry or a url one:

    formats:        A list of dictionaries for each format available, ordered
                    from worst to best quality.

                    Potential fields:
                    * url        Mandatory. The URL of the video file
                    * ext        Will be calculated from url if missing
                    * format     A human-readable description of the format
                                 ("mp4 container with h264/opus").
                                 Calculated from the format_id, width, height.
                                 and format_note fields if missing.
                    * format_id  A short description of the format
                                 ("mp4_h264_opus" or "19")
                    * format_note Additional info about the format
                                 ("3D" or "DASH video")
                    * width      Width of the video, if known
                    * height     Height of the video, if known
                    * resolution Textual description of width and height
                    * abr        Average audio bitrate in KBit/s
                    * acodec     Name of the audio codec in use
                    * vbr        Average video bitrate in KBit/s
                    * vcodec     Name of the video codec in use
                    * filesize   The number of bytes, if known in advance
                    * player_url SWF Player URL (used for rtmpdump).
                    * protocol   The protocol that will be used for the actual
                                 download, lower-case.
                                 "http", "https", "rtsp", "rtmp" or so.
                    * preference Order number of this format. If this field is
                                 present, the formats get sorted by this field.
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.

    ext:            Video filename extension.
    format:         The video format, defaults to ext (used for --get-format)
    player_url:     SWF Player URL (used for rtmpdump).

    The following fields are optional:

    thumbnails:     A list of dictionaries (with the entries "resolution" and
                    "url") for the varying thumbnails
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    subtitles:      The subtitle file contents as a dictionary in the format
                    {language: subtitles}.
    duration:       Length of the video in seconds, as an integer.
    view_count:     How many users have watched the video on the platform.
    like_count:     Number of positive ratings of the video
    dislike_count:  Number of negative ratings of the video
    comment_count:  Number of comments on the video
    age_limit:      Age restriction for the video, as an integer (years)
    webpage_url:    The url to the video webpage, if given to youtube-dl it
                    should allow to get the same result again. (It will be set
                    by YoutubeDL if it's missing)

    Unless mentioned otherwise, the fields should be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """
117 def __init__(self, downloader=None):
118 """Constructor. Receives an optional downloader."""
120 self.set_downloader(downloader)
123 def suitable(cls, url):
124 """Receives a URL and returns True if suitable for this IE."""
126 # This does not use has/getattr intentionally - we want to know whether
127 # we have cached the regexp for *this* class, whereas getattr would also
128 # match the superclass
129 if '_VALID_URL_RE' not in cls.__dict__:
130 cls._VALID_URL_RE = re.compile(cls._VALID_URL)
131 return cls._VALID_URL_RE.match(url) is not None
135 """Getter method for _WORKING."""
138 def initialize(self):
139 """Initializes an instance (authentication, etc)."""
141 self._real_initialize()
144 def extract(self, url):
145 """Extracts URL information and returns it in list of dicts."""
147 return self._real_extract(url)
149 def set_downloader(self, downloader):
150 """Sets the downloader for this IE."""
151 self._downloader = downloader
153 def _real_initialize(self):
154 """Real initialization process. Redefine in subclasses."""
157 def _real_extract(self, url):
158 """Real extraction process. Redefine in subclasses."""
163 """A string for getting the InfoExtractor with get_info_extractor"""
164 return cls.__name__[:-2]
168 return type(self).__name__[:-2]
170 def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
171 """ Returns the response handle """
173 self.report_download_webpage(video_id)
174 elif note is not False:
176 self.to_screen(u'%s' % (note,))
178 self.to_screen(u'%s: %s' % (video_id, note))
180 return self._downloader.urlopen(url_or_request)
181 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
185 errnote = u'Unable to download webpage'
186 errmsg = u'%s: %s' % (errnote, compat_str(err))
188 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
190 self._downloader.report_warning(errmsg)
193 def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
194 """ Returns a tuple (page content as string, URL handle) """
196 # Strip hashes from the URL (#1038)
197 if isinstance(url_or_request, (compat_str, str)):
198 url_or_request = url_or_request.partition('#')[0]
200 urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
204 content_type = urlh.headers.get('Content-Type', '')
205 webpage_bytes = urlh.read()
206 m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
208 encoding = m.group(1)
210 m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
211 webpage_bytes[:1024])
213 encoding = m.group(1).decode('ascii')
216 if self._downloader.params.get('dump_intermediate_pages', False):
218 url = url_or_request.get_full_url()
219 except AttributeError:
221 self.to_screen(u'Dumping request to ' + url)
222 dump = base64.b64encode(webpage_bytes).decode('ascii')
223 self._downloader.to_screen(dump)
224 if self._downloader.params.get('write_pages', False):
226 url = url_or_request.get_full_url()
227 except AttributeError:
229 raw_filename = ('%s_%s.dump' % (video_id, url))
230 filename = sanitize_filename(raw_filename, restricted=True)
231 self.to_screen(u'Saving request to ' + filename)
232 with open(filename, 'wb') as outf:
233 outf.write(webpage_bytes)
235 content = webpage_bytes.decode(encoding, 'replace')
236 return (content, urlh)
238 def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
239 """ Returns the data of the page as a string """
240 res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal)
247 def _download_xml(self, url_or_request, video_id,
248 note=u'Downloading XML', errnote=u'Unable to download XML',
249 transform_source=None):
250 """Return the xml as an xml.etree.ElementTree.Element"""
251 xml_string = self._download_webpage(url_or_request, video_id, note, errnote)
253 xml_string = transform_source(xml_string)
254 return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
256 def report_warning(self, msg, video_id=None):
257 idstr = u'' if video_id is None else u'%s: ' % video_id
258 self._downloader.report_warning(
259 u'[%s] %s%s' % (self.IE_NAME, idstr, msg))
261 def to_screen(self, msg):
262 """Print msg to screen, prefixing it with '[ie_name]'"""
263 self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))
265 def report_extraction(self, id_or_name):
266 """Report information extraction."""
267 self.to_screen(u'%s: Extracting information' % id_or_name)
269 def report_download_webpage(self, video_id):
270 """Report webpage download."""
271 self.to_screen(u'%s: Downloading webpage' % video_id)
273 def report_age_confirmation(self):
274 """Report attempt to confirm age."""
275 self.to_screen(u'Confirming age')
277 def report_login(self):
278 """Report attempt to log in."""
279 self.to_screen(u'Logging in')
    # Methods for following #608
283 def url_result(url, ie=None, video_id=None):
284 """Returns a url that points to a page that should be processed"""
285 #TODO: ie should be the class used for getting the info
286 video_info = {'_type': 'url',
289 if video_id is not None:
290 video_info['id'] = video_id
293 def playlist_result(entries, playlist_id=None, playlist_title=None):
294 """Returns a playlist"""
295 video_info = {'_type': 'playlist',
298 video_info['id'] = playlist_id
300 video_info['title'] = playlist_title
303 def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
305 Perform a regex search on the given string, using a single or a list of
306 patterns returning the first matching group.
307 In case of failure return a default value or raise a WARNING or a
308 RegexNotFoundError, depending on fatal, specifying the field name.
310 if isinstance(pattern, (str, compat_str, compiled_regex_type)):
311 mobj = re.search(pattern, string, flags)
314 mobj = re.search(p, string, flags)
317 if os.name != 'nt' and sys.stderr.isatty():
318 _name = u'\033[0;34m%s\033[0m' % name
323 # return the first matching group
324 return next(g for g in mobj.groups() if g is not None)
325 elif default is not _NO_DEFAULT:
328 raise RegexNotFoundError(u'Unable to extract %s' % _name)
330 self._downloader.report_warning(u'unable to extract %s; '
331 u'please report this issue on http://yt-dl.org/bug' % _name)
334 def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
336 Like _search_regex, but strips HTML tags and unescapes entities.
338 res = self._search_regex(pattern, string, name, default, fatal, flags)
340 return clean_html(res).strip()
344 def _get_login_info(self):
346 Get the the login info as (username, password)
347 It will look in the netrc file using the _NETRC_MACHINE value
348 If there's no info available, return (None, None)
350 if self._downloader is None:
355 downloader_params = self._downloader.params
357 # Attempt to use provided username and password or .netrc data
358 if downloader_params.get('username', None) is not None:
359 username = downloader_params['username']
360 password = downloader_params['password']
361 elif downloader_params.get('usenetrc', False):
363 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
368 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
369 except (IOError, netrc.NetrcParseError) as err:
370 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
372 return (username, password)
    # Helper functions for extracting OpenGraph info
376 def _og_regexes(prop):
377 content_re = r'content=(?:"([^>]+?)"|\'(.+?)\')'
378 property_re = r'property=[\'"]og:%s[\'"]' % re.escape(prop)
379 template = r'<meta[^>]+?%s[^>]+?%s'
381 template % (property_re, content_re),
382 template % (content_re, property_re),
385 def _og_search_property(self, prop, html, name=None, **kargs):
387 name = 'OpenGraph %s' % prop
388 escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
391 return unescapeHTML(escaped)
393 def _og_search_thumbnail(self, html, **kargs):
394 return self._og_search_property('image', html, u'thumbnail url', fatal=False, **kargs)
396 def _og_search_description(self, html, **kargs):
397 return self._og_search_property('description', html, fatal=False, **kargs)
399 def _og_search_title(self, html, **kargs):
400 return self._og_search_property('title', html, **kargs)
402 def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
403 regexes = self._og_regexes('video')
404 if secure: regexes = self._og_regexes('video:secure_url') + regexes
405 return self._html_search_regex(regexes, html, name, **kargs)
407 def _html_search_meta(self, name, html, display_name=None):
408 if display_name is None:
410 return self._html_search_regex(
412 (?=[^>]+(?:itemprop|name|property)=["\']%s["\'])
413 [^>]+content=["\']([^"\']+)["\']''' % re.escape(name),
414 html, display_name, fatal=False)
416 def _dc_search_uploader(self, html):
417 return self._html_search_meta('dc.creator', html, 'uploader')
419 def _rta_search(self, html):
420 # See http://www.rtalabel.org/index.php?content=howtofaq#single
421 if re.search(r'(?ix)<meta\s+name="rating"\s+'
422 r' content="RTA-5042-1996-1400-1577-RTA"',
427 def _media_rating_search(self, html):
428 # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
429 rating = self._html_search_meta('rating', html)
441 return RATING_TABLE.get(rating.lower(), None)
443 def _sort_formats(self, formats):
445 # TODO remove the following workaround
446 from ..utils import determine_ext
447 if not f.get('ext') and 'url' in f:
448 f['ext'] = determine_ext(f['url'])
450 preference = f.get('preference')
451 if preference is None:
452 proto = f.get('protocol')
454 proto = compat_urllib_parse_urlparse(f.get('url', '')).scheme
456 preference = 0 if proto in ['http', 'https'] else -0.1
457 if f.get('ext') in ['f4f', 'f4m']: # Not yet supported
460 if f.get('vcodec') == 'none': # audio only
461 if self._downloader.params.get('prefer_free_formats'):
462 ORDER = [u'aac', u'mp3', u'm4a', u'webm', u'ogg', u'opus']
464 ORDER = [u'webm', u'opus', u'ogg', u'mp3', u'aac', u'm4a']
467 audio_ext_preference = ORDER.index(f['ext'])
469 audio_ext_preference = -1
471 if self._downloader.params.get('prefer_free_formats'):
472 ORDER = [u'flv', u'mp4', u'webm']
474 ORDER = [u'webm', u'flv', u'mp4']
476 ext_preference = ORDER.index(f['ext'])
479 audio_ext_preference = 0
483 f.get('height') if f.get('height') is not None else -1,
484 f.get('width') if f.get('width') is not None else -1,
486 f.get('vbr') if f.get('vbr') is not None else -1,
487 f.get('abr') if f.get('abr') is not None else -1,
488 audio_ext_preference,
489 f.get('filesize') if f.get('filesize') is not None else -1,
492 formats.sort(key=_formats_key)
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        return re.match(cls._make_valid_url(), url) is not None

    def _real_extract(self, query):
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError(u'Invalid search query "%s"' % query)

        prefix = mobj.group('prefix')
        query = mobj.group('query')
        if prefix == '':
            # Bare search key: return a single result.
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)
        else:
            n = int(prefix)
            if n <= 0:
                raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
            elif n > self._MAX_RESULTS:
                self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
                n = self._MAX_RESULTS
            return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        raise NotImplementedError("This method must be implemented by subclasses")

    @property
    def SEARCH_KEY(self):
        return self._SEARCH_KEY