From 8f4babc0f7d838f9661f73933aba58f4951d5920 Mon Sep 17 00:00:00 2001
From: EmirVildanov <reddog201030@gmail.com>
Date: Wed, 22 Feb 2023 10:49:21 +0300
Subject: [PATCH] feat: add CBO statistics structures

---
 AUTHORS                          |   1 +
 sbroad-core/src/cbo.rs           | 144 +++++++++++++++++++++++++++++++
 sbroad-core/src/cbo/histogram.rs |  73 ++++++++++++++++
 sbroad-core/src/errors.rs        |   6 ++
 sbroad-core/src/lib.rs           |   1 +
 5 files changed, 225 insertions(+)
 create mode 100644 sbroad-core/src/cbo.rs
 create mode 100644 sbroad-core/src/cbo/histogram.rs

diff --git a/AUTHORS b/AUTHORS
index 4a1c768d02..7f4a84c479 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -6,6 +6,7 @@ Arseniy Volynets
 Denis Smirnov
 Dmitriy Koltsov
 Dmitriy Travyan
+Emir Vildanov
 Igor Kuznetsov
 
 NOTE: If you can commit a change to this list, please do not hesitate
diff --git a/sbroad-core/src/cbo.rs b/sbroad-core/src/cbo.rs
new file mode 100644
index 0000000000..41350b1e57
--- /dev/null
+++ b/sbroad-core/src/cbo.rs
@@ -0,0 +1,144 @@
+//! Cost Based Optimizer.
+//!
+//! Module used to optimize IR tree using statistics and plan cost calculation algorithms.
+
+use crate::cbo::histogram::Histogram;
+use crate::errors::{Entity, SbroadError};
+use crate::ir::value::Value;
+use std::collections::HashMap;
+
+/// Struct representing statistics for the whole table.
+#[derive(Debug, Clone, PartialEq)]
+pub(crate) struct TableStats {
+    /// Table name.
+    table_name: String,
+    /// Number of rows in the table.
+    rows_number: u64,
+    /// Counters of executed DML operations.
+    ///
+    /// We need them in order to understand when to
+    /// actualize table statistics.
+    ///
+    /// Note that `upsert` command execution is handled by the core by way of
+    /// updating either `update_counter` or `insert_counter`.
+    insert_counter: u32,
+    update_counter: u32,
+    remove_counter: u32,
+}
+
+/// Struct representing statistics for column.
+///
+/// May represent transformed statistics that appear during the application
+/// of CBO algorithms. Note that a transformation of column statistics must
+/// be applied to every field of the structure.
+///
+/// The reasons some values are stored in this structure and not in the `Histogram` structure:
+/// * Sometimes we do not want to receive the whole histogram info. E.g. when
+/// we don't want to apply WHERE and ON conditions, but want to estimate the size
+/// of the table using only `avg_value_size` info.
+/// * Some values may be useful for selectivity estimation
+/// when histograms are at the stage of rebuilding and actualization. Such values as
+/// MIN/MAX and `null_fraction` may be stored without histogram creation.
+#[derive(Debug, Clone, PartialEq)]
+pub(crate) struct ColumnStats<'col_stats> {
+    /// Number of elements in the column.
+    ///
+    /// Note that the field is filled only after `TableStats` for the column table is retrieved.
+    elements_count: usize,
+    /// Min value in the column.
+    min_value: &'col_stats Value,
+    /// Max value in the column.
+    max_value: &'col_stats Value,
+    /// Average size of column row in bytes.
+    avg_value_size: u64,
+    /// Compressed histogram (equi-height histogram with mcv array).
+    ///
+    /// May have no values inside (`elements_count` field equal to 0),
+    /// but it is always present in the `ColumnStats` structure.
+    histogram: &'col_stats Histogram<'col_stats>,
+}
+
+/// Structure for global optimizations
+/// that contains whole statistics information
+/// which may be useful for optimization.
+#[derive(Clone)]
+#[allow(dead_code)]
+pub(crate) struct CostBasedOptimizer<'cbo> {
+    /// Map of
+    /// { (Table name, Column name) -> ColumnStats }
+    /// that originates from `Scan` nodes during traversal of IR relational operators tree.
+    /// Used in `calculate_cost` function in the `Scan` node in order to retrieve stats for
+    /// requested columns.
+    initial_column_stats: HashMap<(String, String), ColumnStats<'cbo>>,
+    /// Vector of `Histogram` structures.
+    /// Initially it's filled with histograms gathered from storages.
+    /// It's updated with new histograms during the statistics transformation process:
+    /// every transformation like UNION, ARITHMETIC_MAP or other will create new histogram and
+    /// append it to the `histograms` vector.
+    histograms: Vec<Histogram<'cbo>>,
+    /// Vector of `Value` structures.
+    /// A storage of values used during the statistics transformation and application process.
+    /// In order not to duplicate values, histogram `Bucket` and `ColumnStats` structures
+    /// store references to the values kept in this storage.
+    values_cache: Vec<Value>,
+}
+
+#[allow(dead_code)]
+impl<'cbo> CostBasedOptimizer<'cbo> {
+    fn new() -> Self {
+        CostBasedOptimizer {
+            initial_column_stats: HashMap::new(),
+            histograms: Vec::new(),
+            values_cache: Vec::new(),
+        }
+    }
+
+    /// Get `initial_column_stats` map.
+    #[cfg(test)]
+    fn get_initial_column_stats(&self) -> &HashMap<(String, String), ColumnStats> {
+        &self.initial_column_stats
+    }
+
+    /// Get value from `initial_column_stats` map by `key`.
+    fn get_from_initial_column_stats(&self, key: &(String, String)) -> Option<&ColumnStats> {
+        self.initial_column_stats.get(key)
+    }
+
+    /// Insert `stats` under `key`, returning the stats it replaced, if any.
+    fn update_initial_column_stats(
+        &mut self,
+        key: (String, String),
+        stats: ColumnStats<'cbo>,
+    ) -> Option<ColumnStats<'cbo>> {
+        self.initial_column_stats.insert(key, stats)
+    }
+
+    /// Adds new histogram to the `histograms` vector.
+    /// Returns the reference (borrowed from `self`) to the newly added histogram.
+    fn push_histogram(
+        &mut self,
+        histogram: Histogram<'cbo>,
+    ) -> Result<&Histogram<'cbo>, SbroadError> {
+        self.histograms.push(histogram);
+        self.histograms.last().ok_or_else(|| {
+            SbroadError::Invalid(
+                Entity::Histogram,
+                Some(String::from("No values in the cbo histograms vector")),
+            )
+        })
+    }
+
+    /// Adds new value to the `values_cache` vector.
+    /// Returns the reference (borrowed from `self`) to the newly added value.
+    fn push_value(&mut self, value: Value) -> Result<&Value, SbroadError> {
+        self.values_cache.push(value);
+        self.values_cache.last().ok_or_else(|| {
+            SbroadError::Invalid(
+                Entity::Value,
+                Some(String::from("No values in the cbo values cache")),
+            )
+        })
+    }
+}
+
+mod histogram;
diff --git a/sbroad-core/src/cbo/histogram.rs b/sbroad-core/src/cbo/histogram.rs
new file mode 100644
index 0000000000..88a9b14d4a
--- /dev/null
+++ b/sbroad-core/src/cbo/histogram.rs
@@ -0,0 +1,73 @@
+//! Equi-height histogram.
+//!
+//! Module used to represent logic of applying and transforming histogram statistics during
+//! CBO algorithms.
+
+use crate::ir::value::Value;
+
+/// Helper structure that represents pair of most common value in the column and its frequency.
+#[derive(Debug, PartialEq, Clone)]
+struct MostCommonValueWithFrequency {
+    value: Value,
+    frequency: f64,
+}
+
+impl MostCommonValueWithFrequency {
+    #[allow(dead_code)]
+    fn new(value: Value, frequency: f64) -> Self {
+        MostCommonValueWithFrequency { value, frequency }
+    }
+}
+
+/// Representation of histogram bucket.
+#[derive(Clone, Debug, PartialEq)]
+struct Bucket<'bucket> {
+    /// From (left border) value of the bucket (not inclusive, except for the first bucket)
+    pub from: &'bucket Value,
+    /// To (right border) value of the bucket (inclusive)
+    pub to: &'bucket Value,
+    /// Bucket frequency.
+    /// Represents the number of elements stored in the bucket.
+    pub frequency: usize,
+}
+
+/// Representation of equi-height histogram.
+///
+/// It's assumed that if the histogram is present, then all
+/// its fields are filled.
+///
+/// Since the biggest part of the logic is taken from the
+/// `PostgreSQL` implementation, you may see `PostgreSQL lines` comments
+/// in some places. It means you can find the
+/// implementation of `PostgreSQL` logic by searching the provided text.
+///
+/// `PostgreSQL` version: `REL_15_2`
+#[derive(Debug, PartialEq, Clone)]
+pub struct Histogram<'histogram> {
+    /// Most common values and their frequencies.
+    most_common: Vec<MostCommonValueWithFrequency>,
+    /// Histogram buckets.
+    ///
+    /// **Note**: Values from mcv are not included in histogram buckets.
+    ///
+    /// Boundaries:
+    /// * i = 0 -> [b_0; b_1] (where `from` field of the bucket is included)
+    /// * i = 1 -> (b_1; b_2]
+    /// * ...
+    /// * i = k -> (b_k; b_(k+1)]
+    buckets: Vec<Bucket<'histogram>>,
+    /// Fraction of NULL values among all column values.
+    null_fraction: f64,
+    /// Number of distinct values for the whole histogram.
+    ///
+    /// **Note**: It is easy during the histogram calculation
+    /// phase to calculate ndv, since the elements have to be sorted
+    /// in order to construct bucket_bounds Vec.
+    ndv: usize,
+    /// Number of elements added into histogram.
+    ///
+    /// **Note**: the number of values added into histogram doesn't
+    /// have to be equal to the number of rows in the table, since
+    /// some rows might have been added after the histogram was created.
+    elements_count: usize,
+}
diff --git a/sbroad-core/src/errors.rs b/sbroad-core/src/errors.rs
index a9032dd7b4..2649cb67b1 100644
--- a/sbroad-core/src/errors.rs
+++ b/sbroad-core/src/errors.rs
@@ -28,6 +28,8 @@ pub enum Entity {
     DistributionKey,
     /// corresponds to enum Expression
     Expression,
+    /// corresponds to struct Histogram
+    Histogram,
     /// tarantool index
     Index,
     /// corresponds to metadata field of struct ProducerResult
@@ -68,6 +70,8 @@ pub enum Entity {
     SQLFunction,
     /// corresponds to struct Statement
     Statement,
+    /// corresponds to CBO statistics
+    Statistics,
     /// SQL sub-query
     SubQuery,
     /// sub-tree of the Plan
@@ -106,6 +110,7 @@ impl fmt::Display for Entity {
             Entity::Distribution => "distribution".to_string(),
             Entity::DistributionKey => "distribution key".to_string(),
             Entity::Expression => "expression".to_string(),
+            Entity::Histogram => "histogram".to_string(),
             Entity::Index => "index".to_string(),
             Entity::Metadata => "metadata".to_string(),
             Entity::Motion => "motion".to_string(),
@@ -126,6 +131,7 @@ impl fmt::Display for Entity {
             Entity::SpaceEngine => "space engine".to_string(),
             Entity::SQLFunction => "SQL function".to_string(),
             Entity::Statement => "statement".to_string(),
+            Entity::Statistics => "statistics".to_string(),
             Entity::SubQuery => "sub-query plan subtree".to_string(),
             Entity::SubTree => "execution plan subtree".to_string(),
             Entity::SyntaxNode => "syntax node".to_string(),
diff --git a/sbroad-core/src/lib.rs b/sbroad-core/src/lib.rs
index bdfb3600a8..f2cf398854 100644
--- a/sbroad-core/src/lib.rs
+++ b/sbroad-core/src/lib.rs
@@ -7,6 +7,7 @@ extern crate pest_derive;
 extern crate core;
 
 pub mod backend;
+pub mod cbo;
 pub mod errors;
 pub mod executor;
 pub mod frontend;
-- 
GitLab