10 import xml.etree.ElementTree
15 compat_urllib_parse_urlparse,
26 _NO_DEFAULT = object()
29 class InfoExtractor(object):
30 """Information Extractor class.
32 Information extractors are the classes that, given a URL, extract
33 information about the video (or videos) the URL refers to. This
34 information includes the real video URL, the video title, author and
35 others. The information is stored in a dictionary which is then
36 passed to the FileDownloader. The FileDownloader processes this
37 information possibly downloading the video to the file system, among
38 other possible outcomes.
40 The dictionaries must include the following fields:
43 title: Video title, unescaped.
45 Additionally, it must contain either a formats entry or a url one:
47 formats: A list of dictionaries for each format available, ordered
48 from worst to best quality.
51 * url Mandatory. The URL of the video file
52 * ext Will be calculated from url if missing
53 * format A human-readable description of the format
54 ("mp4 container with h264/opus").
55 Calculated from the format_id, width, height,
56 and format_note fields if missing.
57 * format_id A short description of the format
58 ("mp4_h264_opus" or "19").
59 Technically optional, but strongly recommended.
60 * format_note Additional info about the format
61 ("3D" or "DASH video")
62 * width Width of the video, if known
63 * height Height of the video, if known
64 * resolution Textual description of width and height
65 * tbr Average bitrate of audio and video in KBit/s
66 * abr Average audio bitrate in KBit/s
67 * acodec Name of the audio codec in use
68 * asr Audio sampling rate in Hertz
69 * vbr Average video bitrate in KBit/s
70 * vcodec Name of the video codec in use
71 * container Name of the container format
72 * filesize The number of bytes, if known in advance
73 * filesize_approx An estimate for the number of bytes
74 * player_url SWF Player URL (used for rtmpdump).
75 * protocol The protocol that will be used for the actual
77 "http", "https", "rtsp", "rtmp", "m3u8" or so.
78 * preference Order number of this format. If this field is
79 present and not None, the formats get sorted
80 by this field, regardless of all other values.
81 -1 for default (order by other properties),
82 -2 or smaller for less than default.
83 * quality Order number of the video quality of this
84 format, irrespective of the file format.
85 -1 for default (order by other properties),
86 -2 or smaller for less than default.
88 ext: Video filename extension.
89 format: The video format, defaults to ext (used for --get-format)
90 player_url: SWF Player URL (used for rtmpdump).
92 The following fields are optional:
94 display_id An alternative identifier for the video, not necessarily
95 unique, but available before title. Typically, id is
96 something like "4234987", title "Dancing naked mole rats",
97 and display_id "dancing-naked-mole-rats"
98 thumbnails: A list of dictionaries, with the following entries:
100 * "width" (optional, int)
101 * "height" (optional, int)
102 * "resolution" (optional, string "{width}x{height}",
104 thumbnail: Full URL to a video thumbnail image.
105 description: One-line video description.
106 uploader: Full name of the video uploader.
107 timestamp: UNIX timestamp of the moment the video became available.
108 upload_date: Video upload date (YYYYMMDD).
109 If not explicitly set, calculated from timestamp.
110 uploader_id: Nickname or id of the video uploader.
111 location: Physical location of the video.
112 subtitles: The subtitle file contents as a dictionary in the format
113 {language: subtitles}.
114 duration: Length of the video in seconds, as an integer.
115 view_count: How many users have watched the video on the platform.
116 like_count: Number of positive ratings of the video
117 dislike_count: Number of negative ratings of the video
118 comment_count: Number of comments on the video
119 age_limit: Age restriction for the video, as an integer (years)
120 webpage_url: The url to the video webpage, if given to youtube-dl it
121 should allow to get the same result again. (It will be set
122 by YoutubeDL if it's missing)
123 categories: A list of categories that the video falls in, for example
126 Unless mentioned otherwise, the fields should be Unicode strings.
128 Subclasses of this one should re-define the _real_initialize() and
129 _real_extract() methods and define a _VALID_URL regexp.
130 Probably, they should also be added to the list of extractors.
132 Finally, the _WORKING attribute should be set to False for broken IEs
133 in order to warn the users and skip the tests.
140 def __init__(self, downloader=None):
141 """Constructor. Receives an optional downloader."""
143 self.set_downloader(downloader)
def suitable(cls, url):
    """Receives a URL and returns True if suitable for this IE."""

    # Deliberately look in cls.__dict__ instead of using hasattr/getattr:
    # the compiled pattern has to be cached on *this* class, and a normal
    # attribute lookup would also find one cached by a superclass.
    own_attrs = cls.__dict__
    if '_VALID_URL_RE' in own_attrs:
        pattern = own_attrs['_VALID_URL_RE']
    else:
        pattern = re.compile(cls._VALID_URL)
        cls._VALID_URL_RE = pattern
    return pattern.match(url) is not None
158 """Getter method for _WORKING."""
161 def initialize(self):
162 """Initializes an instance (authentication, etc)."""
164 self._real_initialize()
167 def extract(self, url):
168 """Extracts URL information and returns it in list of dicts."""
170 return self._real_extract(url)
def set_downloader(self, downloader):
    """Attach the given downloader to this IE (stored as _downloader)."""
    self._downloader = downloader
176 def _real_initialize(self):
177 """Real initialization process. Redefine in subclasses."""
180 def _real_extract(self, url):
181 """Real extraction process. Redefine in subclasses."""
186 """A string for getting the InfoExtractor with get_info_extractor"""
187 return cls.__name__[:-2]
191 return type(self).__name__[:-2]
193 def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
194 """ Returns the response handle """
196 self.report_download_webpage(video_id)
197 elif note is not False:
199 self.to_screen(u'%s' % (note,))
201 self.to_screen(u'%s: %s' % (video_id, note))
203 return self._downloader.urlopen(url_or_request)
204 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
208 errnote = u'Unable to download webpage'
209 errmsg = u'%s: %s' % (errnote, compat_str(err))
211 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
213 self._downloader.report_warning(errmsg)
216 def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
217 """ Returns a tuple (page content as string, URL handle) """
219 # Strip hashes from the URL (#1038)
220 if isinstance(url_or_request, (compat_str, str)):
221 url_or_request = url_or_request.partition('#')[0]
223 urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
227 content_type = urlh.headers.get('Content-Type', '')
228 webpage_bytes = urlh.read()
229 m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
231 encoding = m.group(1)
233 m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
234 webpage_bytes[:1024])
236 encoding = m.group(1).decode('ascii')
237 elif webpage_bytes.startswith(b'\xff\xfe'):
241 if self._downloader.params.get('dump_intermediate_pages', False):
243 url = url_or_request.get_full_url()
244 except AttributeError:
246 self.to_screen(u'Dumping request to ' + url)
247 dump = base64.b64encode(webpage_bytes).decode('ascii')
248 self._downloader.to_screen(dump)
249 if self._downloader.params.get('write_pages', False):
251 url = url_or_request.get_full_url()
252 except AttributeError:
254 basen = '%s_%s' % (video_id, url)
256 h = u'___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
257 basen = basen[:240 - len(h)] + h
258 raw_filename = basen + '.dump'
259 filename = sanitize_filename(raw_filename, restricted=True)
260 self.to_screen(u'Saving request to ' + filename)
261 with open(filename, 'wb') as outf:
262 outf.write(webpage_bytes)
265 content = webpage_bytes.decode(encoding, 'replace')
267 content = webpage_bytes.decode('utf-8', 'replace')
269 if (u'<title>Access to this site is blocked</title>' in content and
270 u'Websense' in content[:512]):
271 msg = u'Access to this webpage has been blocked by Websense filtering software in your network.'
272 blocked_iframe = self._html_search_regex(
273 r'<iframe src="([^"]+)"', content,
274 u'Websense information URL', default=None)
276 msg += u' Visit %s for more details' % blocked_iframe
277 raise ExtractorError(msg, expected=True)
279 return (content, urlh)
281 def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
282 """ Returns the data of the page as a string """
283 res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal)
290 def _download_xml(self, url_or_request, video_id,
291 note=u'Downloading XML', errnote=u'Unable to download XML',
292 transform_source=None, fatal=True):
293 """Return the xml as an xml.etree.ElementTree.Element"""
294 xml_string = self._download_webpage(
295 url_or_request, video_id, note, errnote, fatal=fatal)
296 if xml_string is False:
299 xml_string = transform_source(xml_string)
300 return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
302 def _download_json(self, url_or_request, video_id,
303 note=u'Downloading JSON metadata',
304 errnote=u'Unable to download JSON metadata',
305 transform_source=None,
307 json_string = self._download_webpage(
308 url_or_request, video_id, note, errnote, fatal=fatal)
309 if (not fatal) and json_string is False:
312 json_string = transform_source(json_string)
314 return json.loads(json_string)
315 except ValueError as ve:
316 raise ExtractorError('Failed to download JSON', cause=ve)
def report_warning(self, msg, video_id=None):
    """Pass a warning to the downloader, prefixed with '[ie_name]'.

    When video_id is given, it is inserted (followed by ': ') between
    the IE name and the message.
    """
    if video_id is None:
        id_prefix = u''
    else:
        id_prefix = u'%s: ' % video_id
    warn = self._downloader.report_warning
    warn(u'[%s] %s%s' % (self.IE_NAME, id_prefix, msg))
def to_screen(self, msg):
    """Show msg through the downloader, tagged with '[ie_name]'."""
    emit = self._downloader.to_screen
    emit(u'[%s] %s' % (self.IE_NAME, msg))
def report_extraction(self, id_or_name):
    """Announce on screen that information extraction has started."""
    show = self.to_screen
    show(u'%s: Extracting information' % id_or_name)
def report_download_webpage(self, video_id):
    """Announce on screen that the webpage is being downloaded."""
    show = self.to_screen
    show(u'%s: Downloading webpage' % video_id)
def report_age_confirmation(self):
    """Announce on screen that age confirmation is being attempted."""
    notice = u'Confirming age'
    self.to_screen(notice)
def report_login(self):
    """Announce on screen that a login is being attempted."""
    notice = u'Logging in'
    self.to_screen(notice)
343 #Methods for following #608
345 def url_result(url, ie=None, video_id=None):
346 """Returns a url that points to a page that should be processed"""
347 #TODO: ie should be the class used for getting the info
348 video_info = {'_type': 'url',
351 if video_id is not None:
352 video_info['id'] = video_id
355 def playlist_result(entries, playlist_id=None, playlist_title=None):
356 """Returns a playlist"""
357 video_info = {'_type': 'playlist',
360 video_info['id'] = playlist_id
362 video_info['title'] = playlist_title
365 def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
367 Perform a regex search on the given string, using a single or a list of
368 patterns returning the first matching group.
369 In case of failure return a default value or raise a WARNING or a
370 RegexNotFoundError, depending on fatal, specifying the field name.
372 if isinstance(pattern, (str, compat_str, compiled_regex_type)):
373 mobj = re.search(pattern, string, flags)
376 mobj = re.search(p, string, flags)
380 if os.name != 'nt' and sys.stderr.isatty():
381 _name = u'\033[0;34m%s\033[0m' % name
386 # return the first matching group
387 return next(g for g in mobj.groups() if g is not None)
388 elif default is not _NO_DEFAULT:
391 raise RegexNotFoundError(u'Unable to extract %s' % _name)
393 self._downloader.report_warning(u'unable to extract %s; '
394 u'please report this issue on http://yt-dl.org/bug' % _name)
397 def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
399 Like _search_regex, but strips HTML tags and unescapes entities.
401 res = self._search_regex(pattern, string, name, default, fatal, flags)
403 return clean_html(res).strip()
407 def _get_login_info(self):
409 Get the login info as (username, password)
410 It will look in the netrc file using the _NETRC_MACHINE value
411 If there's no info available, return (None, None)
413 if self._downloader is None:
418 downloader_params = self._downloader.params
420 # Attempt to use provided username and password or .netrc data
421 if downloader_params.get('username', None) is not None:
422 username = downloader_params['username']
423 password = downloader_params['password']
424 elif downloader_params.get('usenetrc', False):
426 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
431 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
432 except (IOError, netrc.NetrcParseError) as err:
433 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))
435 return (username, password)
437 def _get_tfa_info(self):
439 Get the two-factor authentication info
440 TODO - asking the user will be required for sms/phone verify
441 currently just uses the command line option
442 If there's no info available, return None
444 if self._downloader is None:
445 self.to_screen("no downloader")
447 downloader_params = self._downloader.params
449 if downloader_params.get('twofactor', None) is not None:
450 return downloader_params['twofactor']
452 self.to_screen("param is None")
455 # Helper functions for extracting OpenGraph info
457 def _og_regexes(prop):
458 content_re = r'content=(?:"([^>]+?)"|\'([^>]+?)\')'
459 property_re = r'(?:name|property)=[\'"]og:%s[\'"]' % re.escape(prop)
460 template = r'<meta[^>]+?%s[^>]+?%s'
462 template % (property_re, content_re),
463 template % (content_re, property_re),
466 def _og_search_property(self, prop, html, name=None, **kargs):
468 name = 'OpenGraph %s' % prop
469 escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
472 return unescapeHTML(escaped)
474 def _og_search_thumbnail(self, html, **kargs):
475 return self._og_search_property('image', html, u'thumbnail url', fatal=False, **kargs)
477 def _og_search_description(self, html, **kargs):
478 return self._og_search_property('description', html, fatal=False, **kargs)
480 def _og_search_title(self, html, **kargs):
481 return self._og_search_property('title', html, **kargs)
483 def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
484 regexes = self._og_regexes('video')
485 if secure: regexes = self._og_regexes('video:secure_url') + regexes
486 return self._html_search_regex(regexes, html, name, **kargs)
488 def _og_search_url(self, html, **kargs):
489 return self._og_search_property('url', html, **kargs)
491 def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
492 if display_name is None:
494 return self._html_search_regex(
496 (?=[^>]+(?:itemprop|name|property)=["\']?%s["\']?)
497 [^>]+content=["\']([^"\']+)["\']''' % re.escape(name),
498 html, display_name, fatal=fatal, **kwargs)
500 def _dc_search_uploader(self, html):
501 return self._html_search_meta('dc.creator', html, 'uploader')
503 def _rta_search(self, html):
504 # See http://www.rtalabel.org/index.php?content=howtofaq#single
505 if re.search(r'(?ix)<meta\s+name="rating"\s+'
506 r' content="RTA-5042-1996-1400-1577-RTA"',
511 def _media_rating_search(self, html):
512 # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
513 rating = self._html_search_meta('rating', html)
525 return RATING_TABLE.get(rating.lower(), None)
527 def _twitter_search_player(self, html):
528 return self._html_search_meta('twitter:player', html,
529 'twitter card player')
531 def _sort_formats(self, formats):
533 raise ExtractorError(u'No video formats found')
536 # TODO remove the following workaround
537 from ..utils import determine_ext
538 if not f.get('ext') and 'url' in f:
539 f['ext'] = determine_ext(f['url'])
541 preference = f.get('preference')
542 if preference is None:
543 proto = f.get('protocol')
545 proto = compat_urllib_parse_urlparse(f.get('url', '')).scheme
547 preference = 0 if proto in ['http', 'https'] else -0.1
548 if f.get('ext') in ['f4f', 'f4m']: # Not yet supported
551 if f.get('vcodec') == 'none': # audio only
552 if self._downloader.params.get('prefer_free_formats'):
553 ORDER = [u'aac', u'mp3', u'm4a', u'webm', u'ogg', u'opus']
555 ORDER = [u'webm', u'opus', u'ogg', u'mp3', u'aac', u'm4a']
558 audio_ext_preference = ORDER.index(f['ext'])
560 audio_ext_preference = -1
562 if self._downloader.params.get('prefer_free_formats'):
563 ORDER = [u'flv', u'mp4', u'webm']
565 ORDER = [u'webm', u'flv', u'mp4']
567 ext_preference = ORDER.index(f['ext'])
570 audio_ext_preference = 0
574 f.get('quality') if f.get('quality') is not None else -1,
575 f.get('height') if f.get('height') is not None else -1,
576 f.get('width') if f.get('width') is not None else -1,
578 f.get('tbr') if f.get('tbr') is not None else -1,
579 f.get('vbr') if f.get('vbr') is not None else -1,
580 f.get('abr') if f.get('abr') is not None else -1,
581 audio_ext_preference,
582 f.get('filesize') if f.get('filesize') is not None else -1,
583 f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
586 formats.sort(key=_formats_key)
588 def http_scheme(self):
589 """ Either "http:" or "https:", depending on the user's preferences """
592 if self._downloader.params.get('prefer_insecure', False)
595 def _proto_relative_url(self, url, scheme=None):
598 if url.startswith('//'):
600 scheme = self.http_scheme()
605 def _sleep(self, timeout, video_id, msg_template=None):
606 if msg_template is None:
607 msg_template = u'%(video_id)s: Waiting for %(timeout)s seconds'
608 msg = msg_template % {'video_id': video_id, 'timeout': timeout}
612 def _extract_f4m_formats(self, manifest_url, video_id):
613 manifest = self._download_xml(
614 manifest_url, video_id, 'Downloading f4m manifest',
615 'Unable to download f4m manifest')
618 for media_el in manifest.findall('{http://ns.adobe.com/f4m/1.0}media'):
622 'tbr': int_or_none(media_el.attrib.get('bitrate')),
623 'width': int_or_none(media_el.attrib.get('width')),
624 'height': int_or_none(media_el.attrib.get('height')),
626 self._sort_formats(formats)
631 class SearchInfoExtractor(InfoExtractor):
633 Base class for paged search queries extractors.
634 They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
635 Instances should define _SEARCH_KEY and _MAX_RESULTS.
639 def _make_valid_url(cls):
640 return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
def suitable(cls, url):
    """Return True if url matches this extractor's search-query pattern."""
    matched = re.match(cls._make_valid_url(), url)
    return matched is not None
646 def _real_extract(self, query):
647 mobj = re.match(self._make_valid_url(), query)
649 raise ExtractorError(u'Invalid search query "%s"' % query)
651 prefix = mobj.group('prefix')
652 query = mobj.group('query')
654 return self._get_n_results(query, 1)
655 elif prefix == 'all':
656 return self._get_n_results(query, self._MAX_RESULTS)
660 raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
661 elif n > self._MAX_RESULTS:
662 self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
663 n = self._MAX_RESULTS
664 return self._get_n_results(query, n)
666 def _get_n_results(self, query, n):
667 """Get a specified number of results for a query"""
668 raise NotImplementedError("This method must be implemented by subclasses")
def SEARCH_KEY(self):
    """Return the extractor's _SEARCH_KEY."""
    key = self._SEARCH_KEY
    return key