10 import xml.etree.ElementTree
15 compat_urllib_parse_urlparse,
# Sentinel for "no default supplied" so callers may legitimately pass
# default=None to _search_regex and friends.
_NO_DEFAULT = object()


class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    id:             Video identifier.
    title:          Video title, unescaped.

    Additionally, it must contain either a formats entry or a url one:

    formats:        A list of dictionaries for each format available, ordered
                    from worst to best quality.

                    Potential fields:
                    * url        Mandatory. The URL of the video file
                    * ext        Will be calculated from url if missing
                    * format     A human-readable description of the format
                                 ("mp4 container with h264/opus").
                                 Calculated from the format_id, width, height.
                                 and format_note fields if missing.
                    * format_id  A short description of the format
                                 ("mp4_h264_opus" or "19").
                                 Technically optional, but strongly recommended.
                    * format_note Additional info about the format
                                 ("3D" or "DASH video")
                    * width      Width of the video, if known
                    * height     Height of the video, if known
                    * resolution Textual description of width and height
                    * tbr        Average bitrate of audio and video in KBit/s
                    * abr        Average audio bitrate in KBit/s
                    * acodec     Name of the audio codec in use
                    * asr        Audio sampling rate in Hertz
                    * vbr        Average video bitrate in KBit/s
                    * vcodec     Name of the video codec in use
                    * container  Name of the container format
                    * filesize   The number of bytes, if known in advance
                    * filesize_approx  An estimate for the number of bytes
                    * player_url SWF Player URL (used for rtmpdump).
                    * protocol   The protocol that will be used for the actual
                                 download, lower-case.
                                 "http", "https", "rtsp", "rtmp", "m3u8" or so.
                    * preference Order number of this format. If this field is
                                 present and not None, the formats get sorted
                                 by this field, regardless of all other values.
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
                    * quality    Order number of the video quality of this
                                 format, irrespective of the file format.
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
    url:            Final video URL.
    ext:            Video filename extension.
    format:         The video format, defaults to ext (used for --get-format)
    player_url:     SWF Player URL (used for rtmpdump).

    The following fields are optional:

    display_id      An alternative identifier for the video, not necessarily
                    unique, but available before title. Typically, id is
                    something like "4234987", title "Dancing naked mole rats",
                    and display_id "dancing-naked-mole-rats"
    thumbnails:     A list of dictionaries, with the following entries:
                        * "url"
                        * "width" (optional, int)
                        * "height" (optional, int)
                        * "resolution" (optional, string "{width}x{height}",
                                        deprecated)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    timestamp:      UNIX timestamp of the moment the video became available.
    upload_date:    Video upload date (YYYYMMDD).
                    If not explicitly set, calculated from timestamp.
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    subtitles:      The subtitle file contents as a dictionary in the format
                    {language: subtitles}.
    duration:       Length of the video in seconds, as an integer.
    view_count:     How many users have watched the video on the platform.
    like_count:     Number of positive ratings of the video
    dislike_count:  Number of negative ratings of the video
    comment_count:  Number of comments on the video
    age_limit:      Age restriction for the video, as an integer (years)
    webpage_url:    The url to the video webpage, if given to youtube-dl it
                    should allow to get the same result again. (It will be set
                    by YoutubeDL if it's missing)
    categories:     A list of categories that the video falls in, for example
                    ["Sports", "Berlin"]

    Unless mentioned otherwise, the fields should be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    _ready = False          # set by initialize() after _real_initialize() ran
    _downloader = None      # the YoutubeDL instance driving this extractor
    _WORKING = True         # False marks a known-broken IE (warn + skip tests)

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""

        # This does not use has/getattr intentionally - we want to know whether
        # we have cached the regexp for *this* class, whereas getattr would also
        # match the superclass
        if '_VALID_URL_RE' not in cls.__dict__:
            cls._VALID_URL_RE = re.compile(cls._VALID_URL)
        return cls._VALID_URL_RE.match(url) is not None

    @classmethod
    def working(cls):
        """Getter method for _WORKING."""
        return cls._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        # Only run the (possibly expensive) real initialization once.
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @classmethod
    def ie_key(cls):
        """A string for getting the InfoExtractor with get_info_extractor"""
        # Conventionally every extractor class name ends in "IE"; strip it.
        return cls.__name__[:-2]

    @property
    def IE_NAME(self):
        # Same convention as ie_key(), but usable on instances.
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
        """ Returns the response handle """
        if note is None:
            self.report_download_webpage(video_id)
        elif note is not False:
            if video_id is None:
                self.to_screen(u'%s' % (note,))
            else:
                self.to_screen(u'%s: %s' % (video_id, note))
        try:
            return self._downloader.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is False:
                return False
            if errnote is None:
                errnote = u'Unable to download webpage'
            errmsg = u'%s: %s' % (errnote, compat_str(err))
            if fatal:
                raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
            else:
                self._downloader.report_warning(errmsg)
                return False

    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
        """ Returns a tuple (page content as string, URL handle) """

        # Strip hashes from the URL (#1038)
        if isinstance(url_or_request, (compat_str, str)):
            url_or_request = url_or_request.partition('#')[0]

        urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
        if urlh is False:
            assert not fatal
            return False
        content_type = urlh.headers.get('Content-Type', '')
        webpage_bytes = urlh.read()
        # Prefer the charset declared in the HTTP Content-Type header ...
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        if m:
            encoding = m.group(1)
        else:
            # ... then a <meta charset=...> in the first KiB of the body ...
            m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
                          webpage_bytes[:1024])
            if m:
                encoding = m.group(1).decode('ascii')
            elif webpage_bytes.startswith(b'\xff\xfe'):
                # ... then a UTF-16 LE BOM, finally falling back to UTF-8.
                encoding = 'utf-16'
            else:
                encoding = 'utf-8'
        if self._downloader.params.get('dump_intermediate_pages', False):
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                url = url_or_request
            self.to_screen(u'Dumping request to ' + url)
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        if self._downloader.params.get('write_pages', False):
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                url = url_or_request
            basen = '%s_%s' % (video_id, url)
            if len(basen) > 240:
                # Keep filenames within filesystem limits; replace the tail
                # with a hash so distinct URLs still get distinct dumps.
                h = u'___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
                basen = basen[:240 - len(h)] + h
            raw_filename = basen + '.dump'
            filename = sanitize_filename(raw_filename, restricted=True)
            self.to_screen(u'Saving request to ' + filename)
            with open(filename, 'wb') as outf:
                outf.write(webpage_bytes)

        try:
            content = webpage_bytes.decode(encoding, 'replace')
        except LookupError:
            # Unknown codec name declared by the page; fall back to UTF-8.
            content = webpage_bytes.decode('utf-8', 'replace')

        if (u'<title>Access to this site is blocked</title>' in content and
                u'Websense' in content[:512]):
            msg = u'Access to this webpage has been blocked by Websense filtering software in your network.'
            blocked_iframe = self._html_search_regex(
                r'<iframe src="([^"]+)"', content,
                u'Websense information URL', default=None)
            if blocked_iframe:
                msg += u' Visit %s for more details' % blocked_iframe
            raise ExtractorError(msg, expected=True)

        return (content, urlh)

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
        """ Returns the data of the page as a string """
        res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal)
        if res is False:
            # Propagate the non-fatal failure marker.
            return res
        else:
            content, _ = res
            return content

    def _download_xml(self, url_or_request, video_id,
                      note=u'Downloading XML', errnote=u'Unable to download XML',
                      transform_source=None, fatal=True):
        """Return the xml as an xml.etree.ElementTree.Element"""
        xml_string = self._download_webpage(
            url_or_request, video_id, note, errnote, fatal=fatal)
        if xml_string is False:
            return xml_string
        if transform_source:
            xml_string = transform_source(xml_string)
        return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))

    def _download_json(self, url_or_request, video_id,
                       note=u'Downloading JSON metadata',
                       errnote=u'Unable to download JSON metadata',
                       transform_source=None):
        """Return the page contents parsed as JSON."""
        json_string = self._download_webpage(url_or_request, video_id, note, errnote)
        if transform_source:
            json_string = transform_source(json_string)
        try:
            return json.loads(json_string)
        except ValueError as ve:
            raise ExtractorError('Failed to download JSON', cause=ve)

    def report_warning(self, msg, video_id=None):
        idstr = u'' if video_id is None else u'%s: ' % video_id
        self._downloader.report_warning(
            u'[%s] %s%s' % (self.IE_NAME, idstr, msg))

    def to_screen(self, msg):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))

    def report_extraction(self, id_or_name):
        """Report information extraction."""
        self.to_screen(u'%s: Extracting information' % id_or_name)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen(u'%s: Downloading webpage' % video_id)

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen(u'Confirming age')

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    # Methods for following #608
    @staticmethod
    def url_result(url, ie=None, video_id=None):
        """Returns a url that points to a page that should be processed"""
        # TODO: ie should be the class used for getting the info
        video_info = {'_type': 'url',
                      'url': url,
                      'ie_key': ie}
        if video_id is not None:
            video_info['id'] = video_id
        return video_info

    @staticmethod
    def playlist_result(entries, playlist_id=None, playlist_title=None):
        """Returns a playlist"""
        video_info = {'_type': 'playlist',
                      'entries': entries}
        if playlist_id:
            video_info['id'] = playlist_id
        if playlist_title:
            video_info['title'] = playlist_title
        return video_info

    def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
        """
        Perform a regex search on the given string, using a single or a list of
        patterns returning the first matching group.
        In case of failure return a default value or raise a WARNING or a
        RegexNotFoundError, depending on fatal, specifying the field name.
        """
        if isinstance(pattern, (str, compat_str, compiled_regex_type)):
            mobj = re.search(pattern, string, flags)
        else:
            # A list of patterns: first one that matches wins.
            for p in pattern:
                mobj = re.search(p, string, flags)
                if mobj:
                    break

        # Colorize the field name in error output when on a capable terminal.
        if os.name != 'nt' and sys.stderr.isatty():
            _name = u'\033[0;34m%s\033[0m' % name
        else:
            _name = name

        if mobj:
            # return the first matching group
            return next(g for g in mobj.groups() if g is not None)
        elif default is not _NO_DEFAULT:
            return default
        elif fatal:
            raise RegexNotFoundError(u'Unable to extract %s' % _name)
        else:
            self._downloader.report_warning(u'unable to extract %s; '
                u'please report this issue on http://yt-dl.org/bug' % _name)
            return None

    def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
        """
        Like _search_regex, but strips HTML tags and unescapes entities.
        """
        res = self._search_regex(pattern, string, name, default, fatal, flags)
        if res:
            return clean_html(res).strip()
        else:
            return res

    def _get_login_info(self):
        """
        Get the login info as (username, password)
        It will look in the netrc file using the _NETRC_MACHINE value
        If there's no info available, return (None, None)
        """
        if self._downloader is None:
            return (None, None)

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))

        return (username, password)

    # Helper functions for extracting OpenGraph info
    @staticmethod
    def _og_regexes(prop):
        # og:* meta tags may put the property attribute before or after the
        # content attribute; return one pattern for each ordering.
        content_re = r'content=(?:"([^>]+?)"|\'([^>]+?)\')'
        property_re = r'(?:name|property)=[\'"]og:%s[\'"]' % re.escape(prop)
        template = r'<meta[^>]+?%s[^>]+?%s'
        return [
            template % (property_re, content_re),
            template % (content_re, property_re),
        ]

    def _og_search_property(self, prop, html, name=None, **kargs):
        if name is None:
            name = 'OpenGraph %s' % prop
        escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
        if escaped is None:
            return None
        return unescapeHTML(escaped)

    def _og_search_thumbnail(self, html, **kargs):
        return self._og_search_property('image', html, u'thumbnail url', fatal=False, **kargs)

    def _og_search_description(self, html, **kargs):
        return self._og_search_property('description', html, fatal=False, **kargs)

    def _og_search_title(self, html, **kargs):
        return self._og_search_property('title', html, **kargs)

    def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
        regexes = self._og_regexes('video')
        if secure:
            # Prefer the https variant when present.
            regexes = self._og_regexes('video:secure_url') + regexes
        return self._html_search_regex(regexes, html, name, **kargs)

    def _og_search_url(self, html, **kargs):
        return self._og_search_property('url', html, **kargs)

    def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
        if display_name is None:
            display_name = name
        return self._html_search_regex(
            r'''(?ix)<meta
                    (?=[^>]+(?:itemprop|name|property)=["\']?%s["\']?)
                    [^>]+content=["\']([^"\']+)["\']''' % re.escape(name),
            html, display_name, fatal=fatal, **kwargs)

    def _dc_search_uploader(self, html):
        return self._html_search_meta('dc.creator', html, 'uploader')

    def _rta_search(self, html):
        # See http://www.rtalabel.org/index.php?content=howtofaq#single
        if re.search(r'(?ix)<meta\s+name="rating"\s+'
                     r'     content="RTA-5042-1996-1400-1577-RTA"',
                     html):
            return 18
        return 0

    def _media_rating_search(self, html):
        # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
        rating = self._html_search_meta('rating', html)

        if not rating:
            return None

        RATING_TABLE = {
            'safe for kids': 0,
            'general': 8,
            '14 years': 14,
            'mature': 17,
            'restricted': 19,
        }
        return RATING_TABLE.get(rating.lower(), None)

    def _twitter_search_player(self, html):
        return self._html_search_meta('twitter:player', html,
            'twitter card player')

    def _sort_formats(self, formats):
        if not formats:
            raise ExtractorError(u'No video formats found')

        def _formats_key(f):
            # TODO remove the following workaround
            from ..utils import determine_ext
            if not f.get('ext') and 'url' in f:
                f['ext'] = determine_ext(f['url'])

            preference = f.get('preference')
            if preference is None:
                proto = f.get('protocol')
                if proto is None:
                    proto = compat_urllib_parse_urlparse(f.get('url', '')).scheme

                preference = 0 if proto in ['http', 'https'] else -0.1
                if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
                    preference -= 0.5

            if f.get('vcodec') == 'none':  # audio only
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = [u'aac', u'mp3', u'm4a', u'webm', u'ogg', u'opus']
                else:
                    ORDER = [u'webm', u'opus', u'ogg', u'mp3', u'aac', u'm4a']
                ext_preference = 0
                try:
                    audio_ext_preference = ORDER.index(f['ext'])
                except ValueError:
                    audio_ext_preference = -1
            else:
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = [u'flv', u'mp4', u'webm']
                else:
                    ORDER = [u'webm', u'flv', u'mp4']
                try:
                    ext_preference = ORDER.index(f['ext'])
                except ValueError:
                    ext_preference = -1
                audio_ext_preference = 0

            # Missing numeric fields sort below any present value (-1).
            return (
                preference,
                f.get('quality') if f.get('quality') is not None else -1,
                f.get('height') if f.get('height') is not None else -1,
                f.get('width') if f.get('width') is not None else -1,
                ext_preference,
                f.get('tbr') if f.get('tbr') is not None else -1,
                f.get('vbr') if f.get('vbr') is not None else -1,
                f.get('abr') if f.get('abr') is not None else -1,
                audio_ext_preference,
                f.get('filesize') if f.get('filesize') is not None else -1,
                f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
                f.get('format_id'),
            )
        formats.sort(key=_formats_key)

    def http_scheme(self):
        """ Either "http:" or "https:", depending on the user's preferences """
        return (
            'http:'
            if self._downloader.params.get('prefer_insecure', False)
            else 'https:')

    def _proto_relative_url(self, url, scheme=None):
        if url is None:
            return url
        if url.startswith('//'):
            if scheme is None:
                scheme = self.http_scheme()
            return scheme + url
        else:
            return url

    def _sleep(self, timeout, video_id, msg_template=None):
        if msg_template is None:
            msg_template = u'%(video_id)s: Waiting for %(timeout)s seconds'
        msg = msg_template % {'video_id': video_id, 'timeout': timeout}
        self.to_screen(msg)
        time.sleep(timeout)
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        # Empty prefix -> single result; "all" -> _MAX_RESULTS; digits -> n.
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        return re.match(cls._make_valid_url(), url) is not None

    def _real_extract(self, query):
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError(u'Invalid search query "%s"' % query)

        prefix = mobj.group('prefix')
        query = mobj.group('query')
        if prefix == '':
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)
        else:
            n = int(prefix)
            if n <= 0:
                raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
            elif n > self._MAX_RESULTS:
                # Clamp oversized requests to the extractor's hard limit.
                self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
                n = self._MAX_RESULTS
            return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        raise NotImplementedError("This method must be implemented by subclasses")

    @property
    def SEARCH_KEY(self):
        return self._SEARCH_KEY