tinymist_std/adt/fmap.rs
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116
//! A map that shards items by their fingerprint.
use std::{collections::HashMap, num::NonZeroU32};
use crate::hash::Fingerprint;
/// A global upper bound on the shard size.
/// If there are too many shards, the memory overhead is unacceptable.
const MAX_SHARD_SIZE: u32 = 512;
/// Returns a read-only default shard size.
fn default_shard_size() -> NonZeroU32 {
static ITEM_SHARD_SIZE: std::sync::OnceLock<NonZeroU32> = std::sync::OnceLock::new();
/// By testing, we found that the optimal shard size is 2 * number of
/// threads.
fn determine_default_shard_size() -> NonZeroU32 {
// This detection is from rayon.
let thread_cnt = {
std::thread::available_parallelism()
.map(|n| n.get())
.unwrap_or(1)
};
// A valid shard size is a power of two.
let size = (thread_cnt.next_power_of_two() * 2) as u32;
// Perform early non-zero check to avoid panics.
NonZeroU32::new(size.min(MAX_SHARD_SIZE)).unwrap()
}
*ITEM_SHARD_SIZE.get_or_init(determine_default_shard_size)
}
type FMapBase<V> = parking_lot::RwLock<HashMap<Fingerprint, V>>;
/// A map that shards items by their fingerprint. This is faster
/// than the dashmap in some cases.
///
/// It is fast since a fingerprint could split items into different shards
/// efficiently.
///
/// Note: If a fingerprint is not calculated from a hash function, it is not
/// guaranteed that the fingerprint is evenly distributed. Thus, in that case,
/// the performance of this map is not guaranteed.
pub struct FingerprintMap<V> {
mask: u32,
shards: Vec<parking_lot::RwLock<HashMap<Fingerprint, V>>>,
}
impl<V> Default for FingerprintMap<V> {
fn default() -> Self {
Self::new(default_shard_size())
}
}
impl<V> FingerprintMap<V> {
/// Creates a new `FingerprintMap` with the given shard size.
pub fn new(shard_size: NonZeroU32) -> Self {
let shard_size = shard_size.get().next_power_of_two();
let shard_size = shard_size.min(MAX_SHARD_SIZE);
assert!(
shard_size.is_power_of_two(),
"shard size must be a power of two"
);
assert!(shard_size > 0, "shard size must be greater than zero");
Self {
mask: shard_size - 1,
shards: (0..shard_size)
.map(|_| parking_lot::RwLock::new(HashMap::new()))
.collect(),
}
}
/// Iterates over all items in the map.
pub fn into_items(self) -> impl Iterator<Item = (Fingerprint, V)> {
self.shards
.into_iter()
.flat_map(|shard| shard.into_inner().into_iter())
}
/// Gets the shard
pub fn shard(&self, fg: Fingerprint) -> &FMapBase<V> {
let shards = &self.shards;
let route_idx = (fg.lower32() & self.mask) as usize;
// check that the route index is within the bounds of the shards
debug_assert!(route_idx < shards.len());
// SAFETY: `fg` is a valid index into `shards`, as shards size is never changed
// and mask is always a power of two.
unsafe { shards.get_unchecked(route_idx) }
}
/// Gets the mutable shard slice useful for parallel iteration
pub fn as_mut_slice(&mut self) -> &mut [FMapBase<V>] {
&mut self.shards
}
/// Checks if the map is empty.
pub fn contains_key(&self, fg: &Fingerprint) -> bool {
self.shard(*fg).read().contains_key(fg)
}
}
#[cfg(test)]
mod tests {
#[test]
fn test_default_shard_size() {
let size = super::default_shard_size().get();
eprintln!("size = {size}");
assert!(size > 0);
assert_eq!(size & (size - 1), 0);
}
}