From 440ac8e00e87d62ea2f71563526b8ac7f11ee2f5 Mon Sep 17 00:00:00 2001 From: Emir Vildanov <e.vildanov@picodata.io> Date: Thu, 30 Mar 2023 06:56:38 +0000 Subject: [PATCH] feat: add new Statistics trait for sbroad engines --- sbroad-benches/src/engine.rs | 86 ++++-- sbroad-cartridge/src/cartridge/router.rs | 42 ++- sbroad-core/src/cbo.rs | 71 ++++- sbroad-core/src/cbo/histogram.rs | 84 +++-- sbroad-core/src/executor.rs | 2 +- sbroad-core/src/executor/engine.rs | 85 ++++++ sbroad-core/src/executor/engine/mock.rs | 374 +++++++++++++++++++++-- 7 files changed, 658 insertions(+), 86 deletions(-) diff --git a/sbroad-benches/src/engine.rs b/sbroad-benches/src/engine.rs index 7e7a028db4..ff086ee37e 100644 --- a/sbroad-benches/src/engine.rs +++ b/sbroad-benches/src/engine.rs @@ -1,13 +1,15 @@ use std::any::Any; use std::cell::{Ref, RefCell}; use std::collections::HashMap; +use std::rc::Rc; use sbroad::backend::sql::tree::{OrderedSyntaxNodes, SyntaxPlan}; +use sbroad::cbo::{TableColumnPair, TableStats}; use sbroad::errors::{Action, Entity, SbroadError}; use sbroad::executor::bucket::Buckets; use sbroad::executor::engine::{ normalize_name_from_sql, sharding_keys_from_map, sharding_keys_from_tuple, Configuration, - Coordinator, CoordinatorMetadata, + Coordinator, CoordinatorMetadata, InitialColumnStats, Statistics, }; use sbroad::executor::hash::bucket_id_by_tuple; use sbroad::executor::ir::ExecutionPlan; @@ -31,13 +33,6 @@ pub struct RouterConfigurationMock { } impl CoordinatorMetadata for RouterConfigurationMock { - /// Get Table by its name that contains: - /// * list of the columns, - /// * distribution key of the output tuples (column positions), - /// * table name. - /// - /// # Errors - /// - Failed to get table by name from the metadata. 
fn get_table_segment(&self, table_name: &str) -> Result<Table, SbroadError> { let name = normalize_name_from_sql(table_name); match self.tables.get(&name) { @@ -406,9 +401,34 @@ impl Configuration for RouterRuntimeMock { } } +impl Default for RouterRuntimeMock { + fn default() -> Self { + Self::new() + } +} + +impl RouterRuntimeMock { + #[allow(dead_code)] + #[allow(clippy::missing_panics_doc)] + #[must_use] + pub fn new() -> Self { + let cache: LRUCache<String, Plan> = LRUCache::new(DEFAULT_CAPACITY, None).unwrap(); + RouterRuntimeMock { + metadata: RefCell::new(RouterConfigurationMock::new()), + virtual_tables: HashMap::new(), + ir_cache: RefCell::new(cache), + } + } + + #[allow(dead_code)] + pub fn add_virtual_table(&mut self, id: usize, table: VirtualTable) { + self.virtual_tables.insert(id, table); + } +} + impl Coordinator for RouterRuntimeMock { - type ParseTree = AbstractSyntaxTree; type Cache = LRUCache<String, Plan>; + type ParseTree = AbstractSyntaxTree; fn clear_ir_cache(&self) -> Result<(), SbroadError> { *self.ir_cache.borrow_mut() = Self::Cache::new(DEFAULT_CAPACITY, None)?; @@ -475,27 +495,39 @@ impl Coordinator for RouterRuntimeMock { } } -impl Default for RouterRuntimeMock { - fn default() -> Self { - Self::new() +impl Statistics for RouterRuntimeMock { + #[allow(unused_variables)] + fn get_table_stats(&self, table_name: String) -> Result<Rc<TableStats>, SbroadError> { + // Will be added later. + todo!() } -} -impl RouterRuntimeMock { - #[allow(dead_code)] - #[allow(clippy::missing_panics_doc)] - #[must_use] - pub fn new() -> Self { - let cache: LRUCache<String, Plan> = LRUCache::new(DEFAULT_CAPACITY, None).unwrap(); - RouterRuntimeMock { - metadata: RefCell::new(RouterConfigurationMock::new()), - virtual_tables: HashMap::new(), - ir_cache: RefCell::new(cache), - } + #[allow(unused_variables)] + fn get_initial_column_stats( + &self, + table_column_pair: TableColumnPair, + ) -> Result<Rc<InitialColumnStats>, SbroadError> { + // Will be added later. 
+ todo!() } - #[allow(dead_code)] - pub fn add_virtual_table(&mut self, id: usize, table: VirtualTable) { - self.virtual_tables.insert(id, table); + #[allow(unused_variables)] + fn update_table_stats_cache( + &mut self, + table_name: String, + table_stats: TableStats, + ) -> Result<(), SbroadError> { + // Will be added later. + todo!() + } + + #[allow(unused_variables)] + fn update_column_initial_stats_cache( + &self, + table_column_pair: TableColumnPair, + initial_column_stats: InitialColumnStats, + ) -> Result<(), SbroadError> { + // Will be added later. + todo!() } } diff --git a/sbroad-cartridge/src/cartridge/router.rs b/sbroad-cartridge/src/cartridge/router.rs index d25b67e5c3..6f3243de93 100644 --- a/sbroad-cartridge/src/cartridge/router.rs +++ b/sbroad-cartridge/src/cartridge/router.rs @@ -19,11 +19,12 @@ use crate::cartridge::config::RouterConfiguration; use crate::cartridge::update_tracing; use sbroad::backend::sql::tree::{OrderedSyntaxNodes, SyntaxPlan}; +use sbroad::cbo::{TableColumnPair, TableStats}; use sbroad::errors::{Action, Entity, SbroadError}; use sbroad::executor::bucket::Buckets; use sbroad::executor::engine::{ normalize_name_from_schema, sharding_keys_from_map, sharding_keys_from_tuple, Configuration, - Coordinator, CoordinatorMetadata, + Coordinator, CoordinatorMetadata, InitialColumnStats, Statistics, }; use sbroad::executor::hash::bucket_id_by_tuple; use sbroad::executor::ir::{ConnectionType, ExecutionPlan, QueryType}; @@ -267,8 +268,8 @@ impl RouterRuntime { } impl Coordinator for RouterRuntime { - type ParseTree = AbstractSyntaxTree; type Cache = LRUCache<String, Plan>; + type ParseTree = AbstractSyntaxTree; fn clear_ir_cache(&self) -> Result<(), SbroadError> { *self.ir_cache.try_borrow_mut().map_err(|e| { @@ -397,6 +398,43 @@ impl Coordinator for RouterRuntime { } } +impl Statistics for RouterRuntime { + #[allow(unused_variables)] + fn get_table_stats(&self, table_name: String) -> Result<Rc<TableStats>, SbroadError> { + // Will be added 
later. + todo!() + } + + #[allow(unused_variables)] + fn get_initial_column_stats( + &self, + table_column_pair: TableColumnPair, + ) -> Result<Rc<InitialColumnStats>, SbroadError> { + // Will be added later. + todo!() + } + + #[allow(unused_variables)] + fn update_table_stats_cache( + &mut self, + table_name: String, + table_stats: TableStats, + ) -> Result<(), SbroadError> { + // Will be added later. + todo!() + } + + #[allow(unused_variables)] + fn update_column_initial_stats_cache( + &self, + table_column_pair: TableColumnPair, + initial_column_stats: InitialColumnStats, + ) -> Result<(), SbroadError> { + // Will be added later. + todo!() + } +} + impl RouterRuntime { /// Create new Tarantool cartridge runtime. /// diff --git a/sbroad-core/src/cbo.rs b/sbroad-core/src/cbo.rs index 41350b1e57..0f55f8b677 100644 --- a/sbroad-core/src/cbo.rs +++ b/sbroad-core/src/cbo.rs @@ -1,6 +1,11 @@ //! Cost Based Optimizer. //! //! Module used to optimize IR tree using statistics and plan cost calculation algorithms. +//! +//! As soon as the biggest part of the logic is taken from +//! `PostgreSQL` implementation, you may see `PostgreSQL lines` comments +//! in some places with indication of function names and corresponding lines of code. +//! `PostgreSQL` version: `REL_15_2`. use crate::cbo::histogram::Histogram; use crate::errors::{Entity, SbroadError}; @@ -9,7 +14,7 @@ use std::collections::HashMap; /// Struct representing statistics for the whole table. #[derive(Debug, Clone, PartialEq)] -pub(crate) struct TableStats { +pub struct TableStats { /// Table name. table_name: String, /// Number of rows in the table. 
@@ -26,6 +31,25 @@ pub(crate) struct TableStats { remove_counter: u32, } +impl TableStats { + #[must_use] + pub fn new( + table_name: String, + rows_number: u64, + insert_counter: u32, + update_counter: u32, + remove_counter: u32, + ) -> Self { + Self { + table_name, + rows_number, + insert_counter, + update_counter, + remove_counter, + } + } +} + /// Struct representing statistics for column. /// /// May represent transformed statistics, appeared during application @@ -44,13 +68,13 @@ pub(crate) struct ColumnStats<'col_stats> { /// Number of elements in the column. /// /// Note, that the field is filled only ofter `TableStats` for the column table is retrieved. - elements_count: usize, + rows_number: usize, /// Min value in the column. min_value: &'col_stats Value, /// Max value in the column. max_value: &'col_stats Value, /// Average size of column row in bytes. - avg_value_size: u64, + avg_size: u64, /// Compressed histogram (equi-height histogram with mcv array). /// /// May have no values inside (`elements_count` field equal to 0) @@ -58,6 +82,37 @@ pub(crate) struct ColumnStats<'col_stats> { histogram: &'col_stats Histogram<'col_stats>, } +#[allow(dead_code)] +impl<'column_stats> ColumnStats<'column_stats> { + #[must_use] + pub fn new( + elements_count: usize, + min_value: &'column_stats Value, + max_value: &'column_stats Value, + avg_value_size: u64, + histogram: &'column_stats Histogram, + ) -> Self { + Self { + rows_number: elements_count, + min_value, + max_value, + avg_size: avg_value_size, + histogram, + } + } +} + +// Alias for pair of table name and column id in the table. +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub struct TableColumnPair(String, usize); + +#[allow(dead_code)] +impl TableColumnPair { + pub(crate) fn new(table_name: String, column_id: usize) -> Self { + Self(table_name, column_id) + } +} + /// Structure for global optimizations /// that contains whole statistics information /// which may be useful for optimization. 
@@ -69,7 +124,7 @@ pub(crate) struct CostBasedOptimizer<'cbo> { /// that originates from `Scan` nodes during traversal of IR relational operators tree. /// Used in `calculate_cost` function in the `Scan` node in order to retrieve stats for /// requested columns. - initial_column_stats: HashMap<(String, String), ColumnStats<'cbo>>, + initial_column_stats: HashMap<TableColumnPair, ColumnStats<'cbo>>, /// Vector of `Histogram` structures. /// Initially it's filled with histograms gathered from storages. /// It's updated with new histograms during the statistics transformation process: @@ -95,19 +150,19 @@ impl<'cbo> CostBasedOptimizer<'cbo> { /// Get `initial_column_stats` map. #[cfg(test)] - fn get_initial_column_stats(&self) -> &HashMap<(String, String), ColumnStats> { + fn get_initial_column_stats(&self) -> &HashMap<TableColumnPair, ColumnStats> { &self.initial_column_stats } /// Get value from `initial_column_stats` map by `key` - fn get_from_initial_column_stats(&self, key: &(String, String)) -> Option<&ColumnStats> { + fn get_from_initial_column_stats(&self, key: &TableColumnPair) -> Option<&ColumnStats> { self.initial_column_stats.get(key) } /// Add new initial column stats to the `initial_column_stats` map. fn update_initial_column_stats( &'cbo mut self, - key: (String, String), + key: TableColumnPair, stats: ColumnStats<'cbo>, ) -> Option<ColumnStats> { self.initial_column_stats.insert(key, stats) @@ -141,4 +196,4 @@ impl<'cbo> CostBasedOptimizer<'cbo> { } } -mod histogram; +pub mod histogram; diff --git a/sbroad-core/src/cbo/histogram.rs b/sbroad-core/src/cbo/histogram.rs index f3e2f84dc1..a67ced758c 100644 --- a/sbroad-core/src/cbo/histogram.rs +++ b/sbroad-core/src/cbo/histogram.rs @@ -4,34 +4,66 @@ //! CBO algorithms. 
use crate::errors::{Entity, SbroadError}; -use crate::ir::value::{value_to_decimal_or_error, Value}; +use crate::ir::value::{value_to_decimal_or_error, TrivalentOrdering, Value}; use itertools::enumerate; +use std::fmt::Debug; use std::str::FromStr; /// Helper structure that represents pair of most common value in the column and its frequency. #[derive(Debug, PartialEq, Clone)] -struct MostCommonValueWithFrequency { +pub struct MostCommonValueWithFrequency { value: Value, frequency: f64, } impl MostCommonValueWithFrequency { #[allow(dead_code)] - fn new(value: Value, frequency: f64) -> Self { + pub(crate) fn new(value: Value, frequency: f64) -> Self { MostCommonValueWithFrequency { value, frequency } } } /// Representation of histogram bucket. -#[derive(Clone, Debug, PartialEq)] -struct Bucket<'bucket> { - /// From (left border) value of the bucket (not inclusive, except for the first bucket) - pub from: &'bucket Value, - /// To (right order) value of the bucket (inclusive) - pub to: &'bucket Value, - /// Bucket frequency. - /// Represents the number of elements stored in the bucket. - pub frequency: usize, +/// Fields: +/// `from` -- left border value of the bucket. +/// `to` -- right border value of the bucket (always inclusive). +/// `frequency` -- number of elements stored in the bucket. +#[derive(PartialEq, Debug, Clone)] +#[allow(dead_code)] +enum Bucket<'bucket> { + /// Representation of the first histogram bucket with inclusive `from` edge. + First { + from: &'bucket Value, + to: &'bucket Value, + frequency: usize, + }, + /// Representation of a non-first histogram bucket with non-inclusive `from` edge. + NonFirst { + from: &'bucket Value, + to: &'bucket Value, + frequency: usize, + }, +} + +/// Checks whether given value falls into the bucket. +/// +/// Returns `None` in case inner call to `partial_cmp` resulted to None. 
+#[allow(dead_code)] +fn value_falls_into_bucket(bucket: &Bucket, value: &Value) -> Option<bool> { + let (from, to, is_first) = match bucket { + Bucket::First { from, to, .. } => (from, to, true), + Bucket::NonFirst { from, to, .. } => (from, to, false), + }; + let from_partial_cmp = value.partial_cmp(from)?; + let to_partial_cmp = value.partial_cmp(to)?; + if (TrivalentOrdering::Greater == from_partial_cmp + || (is_first && TrivalentOrdering::Equal == from_partial_cmp)) + && (TrivalentOrdering::Less == to_partial_cmp || TrivalentOrdering::Equal == to_partial_cmp) + { + Some(true) + } else { + Some(false) + } } /// Representation of equi-height histogram. @@ -39,12 +71,9 @@ struct Bucket<'bucket> { /// It's assumed that if the histogram is present, then all /// its fields are filled. /// -/// As soon as the biggest part of the logic is taken from -/// `PostgreSQL` implementation, you may see `PostgreSQL lines` comments -/// in some places. It means you can find -/// implementation of `PostgreSQL` logic by searching the provided text. -/// -/// `PostgreSQL` version: `REL_15_2` +/// **Note**: We don't keep the number of rows stored in the corresponding column during the process +/// of histogram creation in order to support cases of table size change. We always take the +/// information about `rows_number` from `ColumnStats` of corresponding column. #[derive(Debug, PartialEq, Clone)] pub struct Histogram<'histogram> { // Most common values and their frequencies. @@ -59,20 +88,15 @@ pub struct Histogram<'histogram> { /// * ... /// * i = n -> (b_(n-2); b_(n-1)] buckets: Vec<Bucket<'histogram>>, - /// Fraction of NULL values among all column values. + /// Fraction of NULL values among all column rows. + /// Always positive value from 0 to 1. null_fraction: f64, - /// Number of distinct values for the whole histogram. 
- /// - /// **Note**: It is easy during the histogram calculation - /// phase to calculate ndv as soon as the elements have to be sorted - /// in order to construct bucket_bounds Vec. - ndv: usize, - /// Number of elements added into histogram. + /// Number of distinct values divided by the number of rows. + /// Always positive value from 0 to 1. /// - /// **Note**: the number of values added into histogram don't - /// have to be equal to the number of rows in the table as soon as - /// some rows might have been added after the histogram was created. - elements_count: usize, + /// **Note**: in order to calculate `number_of_distinct_values` (absolute value) we must + /// use formula `rows_number * (1 - null_fraction) * distinct_values_fraction` + distinct_values_fraction: f64, } /// Helper structure that represents `String` char sequence. diff --git a/sbroad-core/src/executor.rs b/sbroad-core/src/executor.rs index cf62ebf3d9..c6d8777e22 100644 --- a/sbroad-core/src/executor.rs +++ b/sbroad-core/src/executor.rs @@ -109,8 +109,8 @@ where plan = cached_plan.clone(); } if plan.is_empty() { - let ast = C::ParseTree::new(sql)?; let metadata = &*coordinator.cached_config()?; + let ast = C::ParseTree::new(sql)?; plan = ast.resolve_metadata(metadata)?; cache.put(key, plan.clone())?; } diff --git a/sbroad-core/src/executor/engine.rs b/sbroad-core/src/executor/engine.rs index 9938502d9d..f790fcd66a 100644 --- a/sbroad-core/src/executor/engine.rs +++ b/sbroad-core/src/executor/engine.rs @@ -2,10 +2,13 @@ //! //! Traits that define an execution engine interface. 
+use crate::cbo::histogram::MostCommonValueWithFrequency; +use crate::cbo::{TableColumnPair, TableStats}; use std::any::Any; use std::cell::{Ref, RefCell}; use std::cmp::Ordering; use std::collections::HashMap; +use std::rc::Rc; use crate::errors::{Entity, SbroadError}; use crate::executor::bucket::Buckets; @@ -161,6 +164,88 @@ pub trait Coordinator: Configuration { fn determine_bucket_id(&self, s: &[&Value]) -> u64; } +/// Enum that represents initial bucket gathered from storages. +/// It copies `Bucket` enum, where all field are represented by value and not by reference. +#[allow(dead_code)] +#[derive(Debug, Clone)] +enum InitialBucket { + /// Representation of the first histogram bucket with inclusive `from` edge. + First { + from: Value, + to: Value, + frequency: usize, + }, + /// Representation of a non-first histogram bucket with non-inclusive `from` edge. + NonFirst { + from: Value, + to: Value, + frequency: usize, + }, +} + +/// Struct that represents initial histogram gathered from storages. +/// It copies `Histogram` structure, where all field are represented by value and not by reference. +#[allow(dead_code)] +#[derive(Debug, Clone)] +struct InitialHistogram { + most_common: Vec<MostCommonValueWithFrequency>, + buckets: Vec<InitialBucket>, + null_fraction: f64, + distinct_values_fraction: f64, +} + +/// Struct that represents initial statistics gathered from storages. +/// It copies `ColumnStats` structure, where all fields are represented by value and not by reference. +/// +/// **Note**: `rows_number` field is missing, because during `ColumnStats` +/// structure initialization this information will be passed from `TableStats`. +#[allow(dead_code)] +#[derive(Debug, Clone)] +pub struct InitialColumnStats { + min_value: Value, + max_value: Value, + avg_size: u64, + histogram: InitialHistogram, +} + +/// A `CostBased` statistics trait. +pub trait Statistics { + /// Get `TableStats` for table by its name from storages. 
+    ///
+    /// # Errors
+    /// - Table statistics cannot be gathered from either the cache or the storages.
+    fn get_table_stats(&self, table_name: String) -> Result<Rc<TableStats>, SbroadError>;
+
+    /// Get `InitialColumnStats` for column by its table name and column name from storages.
+    ///
+    /// # Errors
+    /// - Initial column statistics cannot be gathered from either the cache or the storages.
+    fn get_initial_column_stats(
+        &self,
+        table_column_pair: TableColumnPair,
+    ) -> Result<Rc<InitialColumnStats>, SbroadError>;
+
+    /// Update `TableStats` cache with given table statistics.
+    ///
+    /// # Errors
+    /// - Table statistics couldn't be mutably borrowed.
+    fn update_table_stats_cache(
+        &mut self,
+        table_name: String,
+        table_stats: TableStats,
+    ) -> Result<(), SbroadError>;
+
+    /// Update `InitialColumnStats` cache with given initial column statistics.
+    ///
+    /// # Errors
+    /// - Initial column statistics couldn't be mutably borrowed.
+    fn update_column_initial_stats_cache(
+        &self,
+        table_column_pair: TableColumnPair,
+        initial_column_stats: InitialColumnStats,
+    ) -> Result<(), SbroadError>;
+}
+
 /// A common function for all engines to calculate the sharding key value from a tuple.
/// /// # Errors diff --git a/sbroad-core/src/executor/engine/mock.rs b/sbroad-core/src/executor/engine/mock.rs index 98b8d83297..ce85673fd4 100644 --- a/sbroad-core/src/executor/engine/mock.rs +++ b/sbroad-core/src/executor/engine/mock.rs @@ -1,14 +1,17 @@ use std::any::Any; use std::cell::{Ref, RefCell}; use std::collections::{HashMap, HashSet}; +use std::rc::Rc; use crate::backend::sql::tree::{OrderedSyntaxNodes, SyntaxPlan}; +use crate::cbo::histogram::MostCommonValueWithFrequency; +use crate::cbo::{TableColumnPair, TableStats}; use crate::collection; use crate::errors::{Action, Entity, SbroadError}; use crate::executor::bucket::Buckets; use crate::executor::engine::{ normalize_name_from_sql, sharding_keys_from_map, sharding_keys_from_tuple, Configuration, - Coordinator, + Coordinator, InitialBucket, InitialColumnStats, InitialHistogram, Statistics, }; use crate::executor::hash::bucket_id_by_tuple; use crate::executor::ir::ExecutionPlan; @@ -251,6 +254,8 @@ pub struct RouterRuntimeMock { metadata: RefCell<RouterConfigurationMock>, virtual_tables: RefCell<HashMap<usize, VirtualTable>>, ir_cache: RefCell<LRUCache<String, Plan>>, + table_statistics_cache: RefCell<HashMap<String, Rc<TableStats>>>, + initial_column_statistics_cache: RefCell<HashMap<TableColumnPair, Rc<InitialColumnStats>>>, } impl std::fmt::Debug for RouterRuntimeMock { @@ -258,6 +263,8 @@ impl std::fmt::Debug for RouterRuntimeMock { f.debug_tuple("") .field(&self.metadata) .field(&self.virtual_tables) + .field(&self.table_statistics_cache) + .field(&self.initial_column_statistics_cache) .finish() } } @@ -320,6 +327,277 @@ impl Configuration for RouterRuntimeMock { } } +impl Default for RouterRuntimeMock { + fn default() -> Self { + Self::new() + } +} + +impl RouterRuntimeMock { + #[allow(dead_code)] + #[allow(clippy::missing_panics_doc)] + #[must_use] + pub fn new() -> Self { + let cache: LRUCache<String, Plan> = LRUCache::new(DEFAULT_CAPACITY, None).unwrap(); + + let mut table_statistics_cache = 
HashMap::new(); + table_statistics_cache.insert( + "\"hash_testing_hist\"".to_string(), + Rc::new(TableStats::new( + "hash_testing_hist".to_string(), + 100, + 2100, + 1000, + 2000, + )), + ); + table_statistics_cache.insert( + "\"hash_testing\"".to_string(), + Rc::new(TableStats::new( + "hash_testing".to_string(), + 1000, + 1200, + 300, + 200, + )), + ); + table_statistics_cache.insert( + "\"test_space\"".to_string(), + Rc::new(TableStats::new( + "test_space".to_string(), + 2500, + 3000, + 1000, + 500, + )), + ); + + // Note that `rows_number` field in inserted column statistics must be equal to the + // `rows_number` field in the corresponding table. + let mut column_statistics_cache = HashMap::new(); + // Column statistics with empty histogram. + // + // * rows_number: 100 + // * min_value: 1 + // * max_value: 50 + // * avg_size: 4 + // * histogram: + // - null_fraction: 0.0 + // - most_common: [] + // - ndv (absolute value): 0 + // - buckets_count: 0 + // - buckets_frequency: 0 + // - buckets_boundaries: [] + column_statistics_cache.insert( + TableColumnPair::new("\"hash_testing_hist\"".to_string(), 0), + Rc::new(InitialColumnStats { + min_value: Value::Integer(1), + max_value: Value::Integer(50), + avg_size: 4, + histogram: InitialHistogram { + most_common: vec![], + buckets: vec![], + null_fraction: 0.0, + distinct_values_fraction: 0.0, + }, + }), + ); + + // Casual column statistics. + // + // Values `min_value` and `max_value` of `ColumnStats` structure are in fact + // displaying MIN and MAX values of `Histogram` structure (that is seen from its + // `most_common` and `buckets_boundaries` fields). + // An example of statistics, where general column statistics and histogram statistics conform. 
+ // + // * rows_number: 1000 + // * min_value: 0 + // * max_value: 15 + // * avg_size: 8 + // * histogram: + // - null_fraction: 0.1 (100) + // - most_common: + // [0 -> 100, + // 1 -> 100, + // 2 -> 50, + // 3 -> 50, + // 4 -> 100] + // - ndv (absolute value): 15 (only 10 in buckets) + // - buckets_count: 5 + // - buckets_frequency: 100 (as only 500 elements are stored in buckets) + // - buckets_boundaries: [5, 7, 9, 11, 13, 15] + column_statistics_cache.insert( + TableColumnPair::new("\"hash_testing\"".to_string(), 0), + Rc::new(InitialColumnStats { + min_value: Value::Integer(0), + max_value: Value::Integer(15), + avg_size: 8, + histogram: InitialHistogram { + most_common: vec![ + MostCommonValueWithFrequency::new(Value::Integer(0), 100.0), + MostCommonValueWithFrequency::new(Value::Integer(1), 100.0), + MostCommonValueWithFrequency::new(Value::Integer(2), 50.0), + MostCommonValueWithFrequency::new(Value::Integer(3), 50.0), + MostCommonValueWithFrequency::new(Value::Integer(4), 100.0), + ], + buckets: vec![ + InitialBucket::First { + from: Value::Integer(5), + to: Value::Integer(7), + frequency: 100, + }, + InitialBucket::NonFirst { + from: Value::Integer(7), + to: Value::Integer(9), + frequency: 100, + }, + InitialBucket::NonFirst { + from: Value::Integer(9), + to: Value::Integer(11), + frequency: 100, + }, + InitialBucket::NonFirst { + from: Value::Integer(11), + to: Value::Integer(13), + frequency: 100, + }, + InitialBucket::NonFirst { + from: Value::Integer(13), + to: Value::Integer(15), + frequency: 100, + }, + ], + null_fraction: 0.1, + // 15 / (1000 * (1 - 0.1)) ~ 15 + distinct_values_fraction: 0.01666, + }, + }), + ); + // Column statistics with unique values. + // Note that it's also a column statistics with no `most_common` values. 
+ // + // * rows_number: 1000 + // * min_value: 1 + // * max_value: 90 + // * avg_size: 4 + // * histogram: + // - null_fraction: 0.1 (100) + // - most_common: [] + // - ndv (absolute value): 900 + // - buckets_count: 3 + // - buckets_frequency: 300 (as all 900 left elements are stored in buckets) + // - buckets_boundaries: [1, 40, 65, 90] + column_statistics_cache.insert( + TableColumnPair::new("\"hash_testing\"".to_string(), 1), + Rc::new(InitialColumnStats { + min_value: Value::Integer(1), + max_value: Value::Integer(900), + avg_size: 4, + histogram: InitialHistogram { + most_common: vec![], + buckets: vec![ + InitialBucket::First { + from: Value::Integer(1), + to: Value::Integer(40), + frequency: 300, + }, + InitialBucket::NonFirst { + from: Value::Integer(40), + to: Value::Integer(65), + frequency: 300, + }, + InitialBucket::NonFirst { + from: Value::Integer(65), + to: Value::Integer(90), + frequency: 300, + }, + ], + null_fraction: 0.1, + // 900 / (1000 * (1 - 0.1)) = 1 + distinct_values_fraction: 1.0, + }, + }), + ); + // Casual column statistics, but for different column in different table. + // + // Values `min_value` and `max_value` of `ColumnStats` structure differ + // from MIN and MAX values that we can get from `Histogram` structure (that is seen from its + // `most_common` and `buckets_boundaries` fields). + // An example of statistics, where general column statistics and histogram statistics + // do NOT conform. That means histogram was gathered before updates to the corresponding table were made. 
+ // + // * rows_number: 2500 + // * min_value: 1 + // * max_value: 2000 + // * avg_size: 8 + // * histogram: + // - null_fraction: 0.2 (500) + // - most_common: + // [3 -> 250, + // 4 -> 50, + // 5 -> 50, + // 6 -> 150] + // - ndv: 104 (only 100 in buckets) + // - buckets_count: 4 + // - buckets_frequency: 375 (as only 1500 elements are stored in buckets) + // - buckets_boundaries: [0, 78, 200, 780, 1800] + column_statistics_cache.insert( + TableColumnPair::new("\"test_space\"".to_string(), 0), + Rc::new(InitialColumnStats { + min_value: Value::Integer(1), + max_value: Value::Integer(2000), + avg_size: 8, + histogram: InitialHistogram { + most_common: vec![ + MostCommonValueWithFrequency::new(Value::Integer(3), 150.0), + MostCommonValueWithFrequency::new(Value::Integer(4), 50.0), + MostCommonValueWithFrequency::new(Value::Integer(5), 50.0), + MostCommonValueWithFrequency::new(Value::Integer(6), 150.0), + ], + buckets: vec![ + InitialBucket::First { + from: Value::Integer(0), + to: Value::Integer(78), + frequency: 375, + }, + InitialBucket::NonFirst { + from: Value::Integer(78), + to: Value::Integer(200), + frequency: 375, + }, + InitialBucket::NonFirst { + from: Value::Integer(200), + to: Value::Integer(780), + frequency: 375, + }, + InitialBucket::NonFirst { + from: Value::Integer(780), + to: Value::Integer(1800), + frequency: 375, + }, + ], + null_fraction: 0.2, + // 104 / (2500 * (1 - 0.2)) = 0.052 + distinct_values_fraction: 0.052, + }, + }), + ); + + RouterRuntimeMock { + metadata: RefCell::new(RouterConfigurationMock::new()), + virtual_tables: RefCell::new(HashMap::new()), + ir_cache: RefCell::new(cache), + table_statistics_cache: RefCell::new(table_statistics_cache), + initial_column_statistics_cache: RefCell::new(column_statistics_cache), + } + } + + #[allow(dead_code)] + pub fn add_virtual_table(&self, id: usize, table: VirtualTable) { + self.virtual_tables.borrow_mut().insert(id, table); + } +} + impl Coordinator for RouterRuntimeMock { type ParseTree = 
AbstractSyntaxTree;
     type Cache = LRUCache<String, Plan>;
@@ -407,28 +685,88 @@ impl Coordinator for RouterRuntimeMock {
     }
 }
 
-impl Default for RouterRuntimeMock {
-    fn default() -> Self {
-        Self::new()
+impl Statistics for RouterRuntimeMock {
+    fn get_table_stats(&self, table_name: String) -> Result<Rc<TableStats>, SbroadError> {
+        if let Ok(borrow_res) = self.table_statistics_cache.try_borrow() {
+            if let Some(value) = borrow_res.get(table_name.as_str()) {
+                Ok(value.clone())
+            } else {
+                Err(SbroadError::NotFound(
+                    Entity::Statistics,
+                    format!("Mocked statistics for table {table_name} wasn't found"),
+                ))
+            }
+        } else {
+            Err(SbroadError::Invalid(
+                Entity::Statistics,
+                Some(String::from("Couldn't borrow table statistics")),
+            ))
+        }
     }
-}
 
-impl RouterRuntimeMock {
-    #[allow(dead_code)]
-    #[allow(clippy::missing_panics_doc)]
-    #[must_use]
-    pub fn new() -> Self {
-        let cache: LRUCache<String, Plan> = LRUCache::new(DEFAULT_CAPACITY, None).unwrap();
-        RouterRuntimeMock {
-            metadata: RefCell::new(RouterConfigurationMock::new()),
-            virtual_tables: RefCell::new(HashMap::new()),
-            ir_cache: RefCell::new(cache),
+    fn get_initial_column_stats(
+        &self,
+        table_column_pair: TableColumnPair,
+    ) -> Result<Rc<InitialColumnStats>, SbroadError> {
+        if let Ok(borrow_res) = self.initial_column_statistics_cache.try_borrow() {
+            if let Some(value) = borrow_res.get(&table_column_pair) {
+                Ok(value.clone())
+            } else {
+                Err(SbroadError::NotFound(
+                    Entity::Statistics,
+                    format!(
+                        "Mocked statistics for table/column pair {table_column_pair:?} wasn't found",
+                    ),
+                ))
+            }
+        } else {
+            Err(SbroadError::Invalid(
+                Entity::Statistics,
+                Some(String::from("Couldn't borrow initial column statistics")),
+            ))
         }
     }
 
-    #[allow(dead_code)]
-    pub fn add_virtual_table(&self, id: usize, table: VirtualTable) {
-        self.virtual_tables.borrow_mut().insert(id, table);
+    fn update_table_stats_cache(
+        &mut self,
+        table_name: String,
+        table_stats: TableStats,
+    ) -> Result<(), SbroadError> {
+        
if let Ok(mut borrow_res) = self.table_statistics_cache.try_borrow_mut() { + let value = borrow_res.get_mut(table_name.as_str()); + if let Some(value) = value { + *value = Rc::new(table_stats) + } else { + borrow_res.insert(table_name, Rc::new(table_stats)); + } + Ok(()) + } else { + Err(SbroadError::Invalid( + Entity::Statistics, + Some(String::from("Couldn't borrow table statistics")), + )) + } + } + + fn update_column_initial_stats_cache( + &self, + table_column_pair: TableColumnPair, + initial_column_stats: InitialColumnStats, + ) -> Result<(), SbroadError> { + if let Ok(mut borrow_res) = self.initial_column_statistics_cache.try_borrow_mut() { + let value = borrow_res.get_mut(&table_column_pair); + if let Some(value) = value { + *value = Rc::new(initial_column_stats) + } else { + borrow_res.insert(table_column_pair, Rc::new(initial_column_stats)); + } + Ok(()) + } else { + Err(SbroadError::Invalid( + Entity::Statistics, + Some(String::from("Couldn't borrow initial column statistics")), + )) + } } } -- GitLab