X-Git-Url: http://git.bitcoin.ninja/index.cgi?p=dnsseed-rust;a=blobdiff_plain;f=src%2Fbloom.rs;fp=src%2Fbloom.rs;h=725587644bfec92a280f20c2c8ff0b6f7f26d430;hp=0000000000000000000000000000000000000000;hb=c06ea88b377bd782c96d74028a11d8595112f272;hpb=92cc9dbd228d9d881cb825c14ed4deb49854cb6e diff --git a/src/bloom.rs b/src/bloom.rs new file mode 100644 index 0000000..7255876 --- /dev/null +++ b/src/bloom.rs @@ -0,0 +1,143 @@ +use std::collections::hash_map::RandomState; +use std::hash::{BuildHasher, Hash, Hasher}; +use std::time::{Duration, Instant}; +use std::marker::PhantomData; + +// Constants for roughly 1 in 1 million fp with 18m entries +/// Number of entries in the filter (each 4 bits). 256MiB in total. +const FILTER_SIZE: usize = 64 * 1024 * 1024 * 8; +const HASHES: usize = 27; +const ROLL_COUNT: usize = 1_240_000; +#[cfg(test)] +const GENERATION_BITS: usize = 2; +#[cfg(not(test))] +const GENERATION_BITS: usize = 4; +pub const GENERATION_COUNT: usize = (1 << GENERATION_BITS) - 1; +const ELEMENTS_PER_BYTE: usize = 8 / GENERATION_BITS; + +pub struct RollingBloomFilter { + last_roll: Instant, + inserted_in_last_generations: [usize; GENERATION_COUNT - 1], + inserted_since_last_roll: usize, + current_generation: u8, + bits: Vec, + hash_keys: [RandomState; HASHES], + _entry_type: PhantomData, +} + +impl RollingBloomFilter { + pub fn new() -> Self { + let mut bits = Vec::new(); + bits.resize(FILTER_SIZE * GENERATION_BITS / 8, 0); + Self { + last_roll: Instant::now(), + inserted_since_last_roll: 0, + inserted_in_last_generations: [0; GENERATION_COUNT - 1], + current_generation: 1, + bits, + hash_keys: [RandomState::new(), RandomState::new(), RandomState::new(), RandomState::new(), RandomState::new(), + RandomState::new(), RandomState::new(), RandomState::new(), RandomState::new(), RandomState::new(), + RandomState::new(), RandomState::new(), RandomState::new(), RandomState::new(), RandomState::new(), + RandomState::new(), RandomState::new(), RandomState::new(), RandomState::new(), RandomState::new(), + RandomState::new(), RandomState::new(), RandomState::new(), RandomState::new(), RandomState::new(), + RandomState::new(), RandomState::new()], + _entry_type: PhantomData, + } + } + + pub fn contains(&self, item: &T) -> bool { + for state in self.hash_keys.iter() { + let mut hasher = state.build_hasher(); + item.hash(&mut hasher); + let idx = hasher.finish() as usize; + + let byte = self.bits[(idx / ELEMENTS_PER_BYTE) % (FILTER_SIZE / 8)]; + let bits_shift = (idx % ELEMENTS_PER_BYTE) * GENERATION_BITS; + let bits = (byte & ((GENERATION_COUNT as u8) << bits_shift)) >> bits_shift; + if bits == 0 { return false; } + } + true + } + + pub fn get_element_count(&self) -> [usize; GENERATION_COUNT] { + let mut res = [0; GENERATION_COUNT]; + res[0..(GENERATION_COUNT-1)].copy_from_slice(&self.inserted_in_last_generations); + *res.last_mut().unwrap() = self.inserted_since_last_roll; + res + } + + pub fn insert(&mut self, item: &T, roll_duration: Duration) { + if Instant::now() - self.last_roll > roll_duration / GENERATION_COUNT as u32 || + self.inserted_since_last_roll > ROLL_COUNT { + self.current_generation += 1; + if self.current_generation == GENERATION_COUNT as u8 + 1 { self.current_generation = 1; } + let remove_generation = self.current_generation; + + for idx in 0..FILTER_SIZE { + let byte = &mut self.bits[(idx / ELEMENTS_PER_BYTE) % (FILTER_SIZE / 8)]; + let bits_shift = (idx % ELEMENTS_PER_BYTE) * GENERATION_BITS; + let bits = (*byte & ((GENERATION_COUNT as u8) << bits_shift)) >> bits_shift; + + if bits == remove_generation { + *byte &= !((GENERATION_COUNT as u8) << bits_shift); + } + } + self.last_roll = Instant::now(); + let mut new_generations = [0; GENERATION_COUNT - 1]; + new_generations[0..GENERATION_COUNT - 2].copy_from_slice(&self.inserted_in_last_generations[1..]); + new_generations[GENERATION_COUNT - 2] = self.inserted_since_last_roll; + self.inserted_in_last_generations = new_generations; + self.inserted_since_last_roll = 0; + } + + for state in self.hash_keys.iter() { + let mut hasher = state.build_hasher(); + item.hash(&mut hasher); + let idx = hasher.finish() as usize; + + let byte = &mut self.bits[(idx / ELEMENTS_PER_BYTE) % (FILTER_SIZE / 8)]; + let bits_shift = (idx % ELEMENTS_PER_BYTE) * GENERATION_BITS; + *byte &= !((GENERATION_COUNT as u8) << bits_shift); + *byte |= self.current_generation << bits_shift; + } + self.inserted_since_last_roll += 1; + } +} + +#[test] +fn test_bloom() { + let mut filter = RollingBloomFilter::new(); + for i in 0..1000 { + filter.insert(&i, Duration::from_secs(60 * 60 * 24)); + } + for i in 0..1000 { + assert!(filter.contains(&i)); + } + for i in 1000..2000 { + assert!(!filter.contains(&i)); + } + assert_eq!(filter.get_element_count(), [0, 0, 1000]); + filter.inserted_since_last_roll = ROLL_COUNT + 1; + filter.insert(&1000, Duration::from_secs(60 * 60 * 24)); + assert_eq!(filter.get_element_count(), [0, ROLL_COUNT + 1, 1]); + for i in 0..1001 { + assert!(filter.contains(&i)); + } + filter.inserted_since_last_roll = ROLL_COUNT + 1; + for i in 1001..2000 { + filter.insert(&i, Duration::from_secs(60 * 60 * 24)); + } + assert_eq!(filter.get_element_count(), [ROLL_COUNT + 1, ROLL_COUNT + 1, 999]); + for i in 0..2000 { + assert!(filter.contains(&i)); + } + filter.inserted_since_last_roll = ROLL_COUNT + 1; + filter.insert(&2000, Duration::from_secs(60 * 60 * 24)); + assert_eq!(filter.get_element_count(), [ROLL_COUNT + 1, ROLL_COUNT + 1, 1]); + for i in 0..1000 { + assert!(!filter.contains(&i)); + } + for i in 1000..2001 { + assert!(filter.contains(&i)); + } +}