7 import xml.etree.ElementTree
12 compat_urllib_parse_urlparse,
# Sentinel used to detect whether a caller supplied an explicit ``default``
# argument (``None`` is a legitimate default value, so it cannot be the marker).
_NO_DEFAULT = object()
25 class InfoExtractor(object):
26 """Information Extractor class.
28 Information extractors are the classes that, given a URL, extract
29 information about the video (or videos) the URL refers to. This
30 information includes the real video URL, the video title, author and
31 others. The information is stored in a dictionary which is then
32 passed to the FileDownloader. The FileDownloader processes this
33 information possibly downloading the video to the file system, among
34 other possible outcomes.
36 The dictionaries must include the following fields:
39 title: Video title, unescaped.
41 Additionally, it must contain either a formats entry or a url one:
43 formats: A list of dictionaries for each format available, ordered
44 from worst to best quality.
47 * url Mandatory. The URL of the video file
48 * ext Will be calculated from url if missing
49 * format A human-readable description of the format
50 ("mp4 container with h264/opus").
Calculated from the format_id, width, height,
and format_note fields if missing.
53 * format_id A short description of the format
54 ("mp4_h264_opus" or "19").
55 Technically optional, but strongly recommended.
56 * format_note Additional info about the format
57 ("3D" or "DASH video")
58 * width Width of the video, if known
59 * height Height of the video, if known
60 * resolution Textual description of width and height
61 * tbr Average bitrate of audio and video in KBit/s
62 * abr Average audio bitrate in KBit/s
63 * acodec Name of the audio codec in use
64 * vbr Average video bitrate in KBit/s
65 * vcodec Name of the video codec in use
66 * filesize The number of bytes, if known in advance
67 * player_url SWF Player URL (used for rtmpdump).
* protocol The protocol that will be used for the actual
download, lower-case.
"http", "https", "rtsp", "rtmp" or so.
71 * preference Order number of this format. If this field is
72 present, the formats get sorted by this field.
73 -1 for default (order by other properties),
74 -2 or smaller for less than default.
76 ext: Video filename extension.
77 format: The video format, defaults to ext (used for --get-format)
78 player_url: SWF Player URL (used for rtmpdump).
80 The following fields are optional:
82 thumbnails: A list of dictionaries (with the entries "resolution" and
83 "url") for the varying thumbnails
84 thumbnail: Full URL to a video thumbnail image.
85 description: One-line video description.
86 uploader: Full name of the video uploader.
87 upload_date: Video upload date (YYYYMMDD).
88 uploader_id: Nickname or id of the video uploader.
89 location: Physical location of the video.
90 subtitles: The subtitle file contents as a dictionary in the format
91 {language: subtitles}.
92 duration: Length of the video in seconds, as an integer.
93 view_count: How many users have watched the video on the platform.
94 like_count: Number of positive ratings of the video
95 dislike_count: Number of negative ratings of the video
96 comment_count: Number of comments on the video
97 age_limit: Age restriction for the video, as an integer (years)
98 webpage_url: The url to the video webpage, if given to youtube-dl it
99 should allow to get the same result again. (It will be set
100 by YoutubeDL if it's missing)
102 Unless mentioned otherwise, the fields should be Unicode strings.
104 Subclasses of this one should re-define the _real_initialize() and
105 _real_extract() methods and define a _VALID_URL regexp.
106 Probably, they should also be added to the list of extractors.
_real_extract() must return a *list* of information dictionaries as
described above.

Finally, the _WORKING attribute should be set to False for broken IEs
in order to warn the users and skip the tests.
"""
def __init__(self, downloader=None):
    """Constructor. Receives an optional downloader."""
    # NOTE(review): this copy appears to be missing at least one line here
    # (upstream also initialises a "ready" flag) — confirm against upstream.
    self.set_downloader(downloader)
def suitable(cls, url):
    """Receives a URL and returns True if suitable for this IE."""
    # Deliberately probe cls.__dict__ instead of hasattr/getattr: the
    # compiled pattern must be cached on *this* class specifically, not
    # found on a superclass.
    try:
        compiled = cls.__dict__['_VALID_URL_RE']
    except KeyError:
        compiled = cls._VALID_URL_RE = re.compile(cls._VALID_URL)
    return compiled.match(url) is not None
137 """Getter method for _WORKING."""
140 def initialize(self):
141 """Initializes an instance (authentication, etc)."""
143 self._real_initialize()
146 def extract(self, url):
147 """Extracts URL information and returns it in list of dicts."""
149 return self._real_extract(url)
def set_downloader(self, downloader):
    """Attach *downloader* (the FileDownloader) to this extractor."""
    self._downloader = downloader
155 def _real_initialize(self):
156 """Real initialization process. Redefine in subclasses."""
159 def _real_extract(self, url):
160 """Real extraction process. Redefine in subclasses."""
165 """A string for getting the InfoExtractor with get_info_extractor"""
166 return cls.__name__[:-2]
170 return type(self).__name__[:-2]
def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
    """ Returns the response handle """
    # NOTE(review): several control-flow lines appear to be missing from this
    # copy (the note None/False branching, the ``try:`` opening, and the
    # fatal/non-fatal split) — this block is not runnable as-is; restore from
    # upstream before relying on it.
    self.report_download_webpage(video_id)
    elif note is not False:
        self.to_screen(u'%s' % (note,))
        self.to_screen(u'%s: %s' % (video_id, note))
        return self._downloader.urlopen(url_or_request)
    except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
        errnote = u'Unable to download webpage'
        errmsg = u'%s: %s' % (errnote, compat_str(err))
        # Network failures are re-raised as ExtractorError with the original
        # exception kept as the cause (when fatal).
        raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
        self._downloader.report_warning(errmsg)
def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
    """ Returns a tuple (page content as string, URL handle) """
    # NOTE(review): guard lines appear to be missing from this copy (the
    # ``if m:``/``else`` around the charset fallbacks, the try/except bodies
    # of the dump branches, and the default-encoding fallback) — restore from
    # upstream before relying on it.
    # Strip hashes from the URL (#1038)
    if isinstance(url_or_request, (compat_str, str)):
        url_or_request = url_or_request.partition('#')[0]
    urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
    # Detect the page encoding: first from the Content-Type header ...
    content_type = urlh.headers.get('Content-Type', '')
    webpage_bytes = urlh.read()
    m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
    encoding = m.group(1)
    # ... then from a <meta charset=...> tag near the top of the document.
    m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
        webpage_bytes[:1024])
    encoding = m.group(1).decode('ascii')
    # Debug aid: dump the raw page (base64) to the screen when requested.
    if self._downloader.params.get('dump_intermediate_pages', False):
        url = url_or_request.get_full_url()
    except AttributeError:
        self.to_screen(u'Dumping request to ' + url)
        dump = base64.b64encode(webpage_bytes).decode('ascii')
        self._downloader.to_screen(dump)
    # Debug aid: save the raw page to a sanitized dump file when requested.
    if self._downloader.params.get('write_pages', False):
        url = url_or_request.get_full_url()
    except AttributeError:
        raw_filename = ('%s_%s.dump' % (video_id, url))
        filename = sanitize_filename(raw_filename, restricted=True)
        self.to_screen(u'Saving request to ' + filename)
        with open(filename, 'wb') as outf:
            outf.write(webpage_bytes)
    # Decode with the detected encoding, replacing undecodable bytes.
    content = webpage_bytes.decode(encoding, 'replace')
    return (content, urlh)
def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
    """ Returns the data of the page as a string """
    # NOTE(review): the tail of this method appears to be missing from this
    # copy (propagating a False result and returning just the content part of
    # the (content, handle) tuple) — restore from upstream.
    res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal)
249 def _download_xml(self, url_or_request, video_id,
250 note=u'Downloading XML', errnote=u'Unable to download XML',
251 transform_source=None):
252 """Return the xml as an xml.etree.ElementTree.Element"""
253 xml_string = self._download_webpage(url_or_request, video_id, note, errnote)
255 xml_string = transform_source(xml_string)
256 return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
def report_warning(self, msg, video_id=None):
    """Route *msg* to the downloader's warning output, prefixed with the
    IE name and, when given, the video id."""
    if video_id is None:
        prefix = u''
    else:
        prefix = u'%s: ' % video_id
    self._downloader.report_warning(u'[%s] %s%s' % (self.IE_NAME, prefix, msg))
def to_screen(self, msg):
    """Print msg to screen, prefixing it with '[IE_NAME]'."""
    tagged = u'[%s] %s' % (self.IE_NAME, msg)
    self._downloader.to_screen(tagged)
def report_extraction(self, id_or_name):
    """Announce that information extraction has begun for *id_or_name*."""
    message = u'%s: Extracting information' % id_or_name
    self.to_screen(message)
def report_download_webpage(self, video_id):
    """Announce that the webpage for *video_id* is being downloaded."""
    message = u'%s: Downloading webpage' % video_id
    self.to_screen(message)
def report_age_confirmation(self):
    """Announce an attempt to confirm the user's age."""
    self.to_screen(u'Confirming age')
def report_login(self):
    """Announce a login attempt."""
    self.to_screen(u'Logging in')
# Methods for following #608
# NOTE(review): upstream this is a @staticmethod; the decorator is not
# visible in this copy.
def url_result(url, ie=None, video_id=None):
    """Returns a url that points to a page that should be processed"""
    #TODO: ie should be the class used for getting the info
    video_info = {'_type': 'url',
    # NOTE(review): the remaining dict entries and the final
    # ``return video_info`` appear to be missing from this copy.
    if video_id is not None:
        video_info['id'] = video_id
# NOTE(review): upstream this is a @staticmethod; the decorator is not
# visible in this copy.
def playlist_result(entries, playlist_id=None, playlist_title=None):
    """Returns a playlist"""
    # NOTE(review): the entries dict item, the ``is not None`` guards around
    # the two assignments below, and the final return appear to be missing
    # from this copy.
    video_info = {'_type': 'playlist',
        video_info['id'] = playlist_id
        video_info['title'] = playlist_title
def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
    """
    Perform a regex search on the given string, using a single or a list of
    patterns returning the first matching group.
    In case of failure return a default value or raise a WARNING or a
    RegexNotFoundError, depending on fatal, specifying the field name.
    """
    # NOTE(review): the list-of-patterns loop, the ``if mobj:`` guard and the
    # branch glue between the cases below appear to be missing from this
    # copy — restore from upstream before relying on it.
    if isinstance(pattern, (str, compat_str, compiled_regex_type)):
        mobj = re.search(pattern, string, flags)
        mobj = re.search(p, string, flags)
    # Colourise the field name on terminals that support ANSI escapes.
    if os.name != 'nt' and sys.stderr.isatty():
        _name = u'\033[0;34m%s\033[0m' % name
    # return the first matching group
    return next(g for g in mobj.groups() if g is not None)
    elif default is not _NO_DEFAULT:
    raise RegexNotFoundError(u'Unable to extract %s' % _name)
    self._downloader.report_warning(u'unable to extract %s; '
        u'please report this issue on http://yt-dl.org/bug' % _name)
def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
    """
    Like _search_regex, but strips HTML tags and unescapes entities.
    """
    res = self._search_regex(pattern, string, name, default, fatal, flags)
    # BUG FIX: _search_regex can hand back None (or a falsy default) when
    # nothing matched and fatal is off; calling clean_html(res).strip()
    # unconditionally would raise on that path, so guard it.
    if res:
        return clean_html(res).strip()
    else:
        return None
def _get_login_info(self):
    """
    Get the login info as (username, password)
    It will look in the netrc file using the _NETRC_MACHINE value
    If there's no info available, return (None, None)
    """
    # NOTE(review): the username/password = None initialisation, the early
    # ``return (None, None)``, and the if/else around the netrc lookup result
    # appear to be missing from this copy — restore from upstream.
    if self._downloader is None:
    downloader_params = self._downloader.params
    # Attempt to use provided username and password or .netrc data
    if downloader_params.get('username', None) is not None:
        username = downloader_params['username']
        password = downloader_params['password']
    elif downloader_params.get('usenetrc', False):
        info = netrc.netrc().authenticators(self._NETRC_MACHINE)
        raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
        except (IOError, netrc.NetrcParseError) as err:
            self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
    return (username, password)
376 # Helper functions for extracting OpenGraph info
# NOTE(review): upstream this is a @staticmethod; the decorator and the
# ``return [`` / ``]`` brackets around the two alternatives below appear to
# be missing from this copy.
def _og_regexes(prop):
    """Build regexes matching an og:PROP <meta> tag with the property and
    content attributes in either order."""
    content_re = r'content=(?:"([^>]+?)"|\'(.+?)\')'
    property_re = r'property=[\'"]og:%s[\'"]' % re.escape(prop)
    template = r'<meta[^>]+?%s[^>]+?%s'
    template % (property_re, content_re),
    template % (content_re, property_re),
def _og_search_property(self, prop, html, name=None, **kargs):
    """Search *html* for the og:PROP meta tag and return its unescaped value."""
    # NOTE(review): the ``if name is None:`` guard before this assignment and
    # the None-check before unescaping appear to be missing from this copy.
    name = 'OpenGraph %s' % prop
    escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
    return unescapeHTML(escaped)
395 def _og_search_thumbnail(self, html, **kargs):
396 return self._og_search_property('image', html, u'thumbnail url', fatal=False, **kargs)
398 def _og_search_description(self, html, **kargs):
399 return self._og_search_property('description', html, fatal=False, **kargs)
401 def _og_search_title(self, html, **kargs):
402 return self._og_search_property('title', html, **kargs)
404 def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
405 regexes = self._og_regexes('video')
406 if secure: regexes = self._og_regexes('video:secure_url') + regexes
407 return self._html_search_regex(regexes, html, name, **kargs)
def _html_search_meta(self, name, html, display_name=None):
    """Return the content attribute of a <meta> tag whose itemprop/name/
    property attribute equals *name* (non-fatal)."""
    if display_name is None:
    # NOTE(review): the ``display_name = name`` fallback and the opening
    # r''' of the raw-string pattern below appear to be missing from this
    # copy — restore from upstream.
    return self._html_search_regex(
        (?=[^>]+(?:itemprop|name|property)=["\']%s["\'])
        [^>]+content=["\']([^"\']+)["\']''' % re.escape(name),
        html, display_name, fatal=False)
418 def _dc_search_uploader(self, html):
419 return self._html_search_meta('dc.creator', html, 'uploader')
def _rta_search(self, html):
    # See http://www.rtalabel.org/index.php?content=howtofaq#single
    # NOTE(review): the closing argument of re.search and the age-limit
    # results (upstream returns 18 on a match, else 0) appear to be missing
    # from this copy — restore from upstream.
    if re.search(r'(?ix)<meta\s+name="rating"\s+'
                 r' content="RTA-5042-1996-1400-1577-RTA"',
def _media_rating_search(self, html):
    # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
    rating = self._html_search_meta('rating', html)
    # NOTE(review): the None-guard on ``rating`` and the RATING_TABLE mapping
    # definition appear to be missing from this copy — restore from upstream.
    return RATING_TABLE.get(rating.lower(), None)
def _sort_formats(self, formats):
    """Sort *formats* in place (worst quality first) using a composite key."""
    # NOTE(review): the ``def _formats_key(f):`` header and many branch/glue
    # lines of the key function appear to be missing from this copy — the
    # fragments below are kept verbatim; restore from upstream before use.
    # TODO remove the following workaround
    from ..utils import determine_ext
    if not f.get('ext') and 'url' in f:
        f['ext'] = determine_ext(f['url'])
    preference = f.get('preference')
    if preference is None:
        proto = f.get('protocol')
        proto = compat_urllib_parse_urlparse(f.get('url', '')).scheme
        # Plain HTTP(S) downloads are slightly preferred over other protocols.
        preference = 0 if proto in ['http', 'https'] else -0.1
        if f.get('ext') in ['f4f', 'f4m']: # Not yet supported
    if f.get('vcodec') == 'none': # audio only
        if self._downloader.params.get('prefer_free_formats'):
            ORDER = [u'aac', u'mp3', u'm4a', u'webm', u'ogg', u'opus']
            ORDER = [u'webm', u'opus', u'ogg', u'mp3', u'aac', u'm4a']
            audio_ext_preference = ORDER.index(f['ext'])
            audio_ext_preference = -1
        if self._downloader.params.get('prefer_free_formats'):
            ORDER = [u'flv', u'mp4', u'webm']
            ORDER = [u'webm', u'flv', u'mp4']
            ext_preference = ORDER.index(f['ext'])
        audio_ext_preference = 0
        f.get('height') if f.get('height') is not None else -1,
        f.get('width') if f.get('width') is not None else -1,
        f.get('vbr') if f.get('vbr') is not None else -1,
        f.get('abr') if f.get('abr') is not None else -1,
        audio_ext_preference,
        f.get('filesize') if f.get('filesize') is not None else -1,
    formats.sort(key=_formats_key)
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """
505 def _make_valid_url(cls):
506 return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
# NOTE(review): upstream this is a @classmethod; the decorator is not
# visible in this copy.
def suitable(cls, url):
    """Return True when *url* matches this extractor's search-URL pattern."""
    return bool(re.match(cls._make_valid_url(), url))
def _real_extract(self, query):
    """Dispatch a '<key>[N|all]:<query>' search URL to _get_n_results."""
    # NOTE(review): the ``if mobj is None:`` guard, the empty-prefix branch,
    # the ``n = int(prefix)`` conversion and the ``if n <= 0:`` check appear
    # to be missing from this copy — restore from upstream before use.
    mobj = re.match(self._make_valid_url(), query)
        raise ExtractorError(u'Invalid search query "%s"' % query)
    prefix = mobj.group('prefix')
    query = mobj.group('query')
        return self._get_n_results(query, 1)
    elif prefix == 'all':
        return self._get_n_results(query, self._MAX_RESULTS)
        raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
    elif n > self._MAX_RESULTS:
        # Clamp over-large requests to the extractor's maximum, with a warning.
        self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
        n = self._MAX_RESULTS
    return self._get_n_results(query, n)
532 def _get_n_results(self, query, n):
533 """Get a specified number of results for a query"""
534 raise NotImplementedError("This method must be implemented by subclasses")
# NOTE(review): upstream this is a @property; the decorator is not visible
# in this copy.
def SEARCH_KEY(self):
    """Expose the class's _SEARCH_KEY."""
    return self._SEARCH_KEY