From 8f4babc0f7d838f9661f73933aba58f4951d5920 Mon Sep 17 00:00:00 2001
From: EmirVildanov <reddog201030@gmail.com>
Date: Wed, 22 Feb 2023 10:49:21 +0300
Subject: [PATCH] feat: add CBO statistics structures

---
 AUTHORS                          |   1 +
 sbroad-core/src/cbo.rs           | 144 +++++++++++++++++++++++++++++++
 sbroad-core/src/cbo/histogram.rs |  73 ++++++++++++++++
 sbroad-core/src/errors.rs        |   6 ++
 sbroad-core/src/lib.rs           |   1 +
 5 files changed, 225 insertions(+)
 create mode 100644 sbroad-core/src/cbo.rs
 create mode 100644 sbroad-core/src/cbo/histogram.rs

diff --git a/AUTHORS b/AUTHORS
index 4a1c768d02..7f4a84c479 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -6,6 +6,7 @@ Arseniy Volynets
 Denis Smirnov
 Dmitriy Koltsov
 Dmitriy Travyan
+Emir Vildanov
 Igor Kuznetsov
 
 NOTE: If you can commit a change to this list, please do not hesitate
diff --git a/sbroad-core/src/cbo.rs b/sbroad-core/src/cbo.rs
new file mode 100644
index 0000000000..41350b1e57
--- /dev/null
+++ b/sbroad-core/src/cbo.rs
@@ -0,0 +1,144 @@
+//! Cost Based Optimizer.
+//!
+//! Module used to optimize IR tree using statistics and plan cost calculation algorithms.
+
+use crate::cbo::histogram::Histogram;
+use crate::errors::{Entity, SbroadError};
+use crate::ir::value::Value;
+use std::collections::HashMap;
+
+/// Struct representing statistics for the whole table.
+#[derive(Debug, Clone, PartialEq)]
+pub(crate) struct TableStats {
+    /// Table name.
+    table_name: String,
+    /// Number of rows in the table.
+    rows_number: u64,
+    /// Counters of executed DML operations.
+    ///
+    /// We need them in order to understand when the
+    /// table statistics have to be refreshed.
+    ///
+    /// Note that an `upsert` command execution is handled by the core
+    /// by updating either `update_counter` or `insert_counter`.
+    insert_counter: u32,
+    update_counter: u32,
+    remove_counter: u32,
+}
+
+/// Struct representing statistics for a column.
+///
+/// May represent transformed statistics that appeared during the application
+/// of CBO algorithms. Note that a transformation of column statistics must
+/// be applied to every field of the structure.
+///
+/// The reasons some values are stored in this structure and not in the `Histogram` structure:
+/// * Sometimes we do not want to receive the whole histogram info. E.g. when
+///   we don't want to apply WHERE and ON conditions, but want to estimate the size
+///   of the table using only the `avg_value_size` info.
+/// * Some values may be useful for selectivity estimation
+///   while histograms are being rebuilt and brought up to date. Such values as
+///   MIN/MAX and `null_fraction` may be stored without histogram creation.
+#[derive(Debug, Clone, PartialEq)]
+pub(crate) struct ColumnStats<'col_stats> {
+    /// Number of elements in the column.
+    ///
+    /// Note that the field is filled only after `TableStats` for the column table is retrieved.
+    elements_count: usize,
+    /// Min value in the column.
+    min_value: &'col_stats Value,
+    /// Max value in the column.
+    max_value: &'col_stats Value,
+    /// Average size of a value in the column, in bytes.
+    avg_value_size: u64,
+    /// Compressed histogram (equi-height histogram with mcv array).
+    ///
+    /// May have no values inside (`elements_count` field equal to 0), but
+    /// it's always present in the `ColumnStats` structure.
+    histogram: &'col_stats Histogram<'col_stats>,
+}
+
+/// Structure for global optimizations
+/// that contains the whole statistics information
+/// which may be useful for optimization.
+#[derive(Clone)]
+#[allow(dead_code)]
+pub(crate) struct CostBasedOptimizer<'cbo> {
+    /// Map of
+    /// `{ (Table name, Column name) -> ColumnStats }`
+    /// that originates from `Scan` nodes during traversal of IR relational operators tree.
+    /// Used in `calculate_cost` function in the `Scan` node in order to retrieve stats for
+    /// requested columns.
+    initial_column_stats: HashMap<(String, String), ColumnStats<'cbo>>,
+    /// Vector of `Histogram` structures.
+    /// Initially it's filled with histograms gathered from storages.
+    /// It's updated with new histograms during the statistics transformation process:
+    /// every transformation like `UNION`, `ARITHMETIC_MAP` or other will create a new histogram
+    /// and append it to the `histograms` vector.
+    histograms: Vec<Histogram<'cbo>>,
+    /// Vector of `Value` structures.
+    /// A storage of values used during the statistics transformation and application process.
+    /// In order not to store values in the histogram `Bucket` and `ColumnStats` structures,
+    /// those structures store references to the values kept in this storage.
+    values_cache: Vec<Value>,
+}
+
+#[allow(dead_code)]
+impl<'cbo> CostBasedOptimizer<'cbo> {
+    fn new() -> Self {
+        CostBasedOptimizer {
+            initial_column_stats: HashMap::new(),
+            histograms: Vec::new(),
+            values_cache: Vec::new(),
+        }
+    }
+
+    /// Returns the whole `initial_column_stats` map.
+    #[cfg(test)]
+    fn get_initial_column_stats(&self) -> &HashMap<(String, String), ColumnStats> {
+        &self.initial_column_stats
+    }
+
+    /// Returns the stats stored in the `initial_column_stats` map under `key`, if any.
+    fn get_from_initial_column_stats(&self, key: &(String, String)) -> Option<&ColumnStats> {
+        self.initial_column_stats.get(key)
+    }
+
+    /// Stores `stats` under `key` in `initial_column_stats`; returns the replaced entry, if any.
+    fn update_initial_column_stats(
+        &mut self,
+        key: (String, String),
+        stats: ColumnStats<'cbo>,
+    ) -> Option<ColumnStats<'cbo>> {
+        self.initial_column_stats.insert(key, stats)
+    }
+
+    /// Appends `histogram` to the `histograms` vector and returns a reference to it.
+    /// NOTE(review): `&'cbo mut self` keeps `self` mutably borrowed for all of `'cbo` — intended?
+    fn push_histogram(
+        &'cbo mut self,
+        histogram: Histogram<'cbo>,
+    ) -> Result<&Histogram, SbroadError> {
+        self.histograms.push(histogram);
+        self.histograms.last().ok_or_else(|| {
+            SbroadError::Invalid(
+                Entity::Histogram,
+                Some(String::from("No values in the cbo histograms vector")),
+            )
+        })
+    }
+
+    /// Appends `value` to the `values_cache` vector.
+    /// Returns the reference to the newly added value.
+    fn push_value(&mut self, value: Value) -> Result<&Value, SbroadError> {
+        self.values_cache.push(value);
+        self.values_cache.last().ok_or_else(|| {
+            SbroadError::Invalid(
+                Entity::Value,
+                Some(String::from("No values in the cbo values cache")),
+            )
+        })
+    }
+}
+
+mod histogram;
diff --git a/sbroad-core/src/cbo/histogram.rs b/sbroad-core/src/cbo/histogram.rs
new file mode 100644
index 0000000000..88a9b14d4a
--- /dev/null
+++ b/sbroad-core/src/cbo/histogram.rs
@@ -0,0 +1,73 @@
+//! Equi-height histogram.
+//!
+//! Module used to represent logic of applying and transforming histogram statistics during
+//! CBO algorithms.
+
+use crate::ir::value::Value;
+
+/// Helper structure that pairs one of the most common values of the column with its frequency.
+#[derive(Debug, PartialEq, Clone)]
+struct MostCommonValueWithFrequency {
+    value: Value,
+    frequency: f64,
+}
+
+impl MostCommonValueWithFrequency {
+    #[allow(dead_code)]
+    fn new(value: Value, frequency: f64) -> Self {
+        MostCommonValueWithFrequency { value, frequency }
+    }
+}
+
+/// Representation of a histogram bucket.
+#[derive(Clone, Debug, PartialEq)]
+struct Bucket<'bucket> {
+    /// From (left border) value of the bucket (not inclusive, except for the first bucket).
+    pub from: &'bucket Value,
+    /// To (right border) value of the bucket (inclusive).
+    pub to: &'bucket Value,
+    /// Bucket frequency.
+    /// Represents the number of elements stored in the bucket.
+    pub frequency: usize,
+}
+
+/// Representation of an equi-height histogram.
+///
+/// It's assumed that if the histogram is present, then all
+/// its fields are filled.
+///
+/// Since the biggest part of the logic is taken from the
+/// `PostgreSQL` implementation, you may see `PostgreSQL lines` comments
+/// in some places. It means you can find the
+/// implementation of the `PostgreSQL` logic by searching the provided text.
+///
+/// `PostgreSQL` version: `REL_15_2`
+#[derive(Debug, PartialEq, Clone)]
+pub struct Histogram<'histogram> {
+    /// Most common values and their frequencies.
+    most_common: Vec<MostCommonValueWithFrequency>,
+    /// Histogram buckets.
+    ///
+    /// **Note**: Values from mcv are not included in histogram buckets.
+    ///
+    /// Boundaries:
+    /// * i = 0 -> [b_0; b_1] (where `from` field of the bucket is included)
+    /// * i = 1 -> (b_1; b_2]
+    /// * ...
+    /// * i = k -> (b_k; b_(k+1)]
+    buckets: Vec<Bucket<'histogram>>,
+    /// Fraction of NULL values among all column values.
+    null_fraction: f64,
+    /// Number of distinct values for the whole histogram.
+    ///
+    /// **Note**: It is easy to calculate ndv during the histogram calculation
+    /// phase, since the elements have to be sorted
+    /// in order to construct the `bucket_bounds` Vec.
+    ndv: usize,
+    /// Number of elements added into the histogram.
+    ///
+    /// **Note**: the number of values added into the histogram doesn't
+    /// have to be equal to the number of rows in the table, since
+    /// some rows might have been added after the histogram was created.
+    elements_count: usize,
+}
diff --git a/sbroad-core/src/errors.rs b/sbroad-core/src/errors.rs
index a9032dd7b4..2649cb67b1 100644
--- a/sbroad-core/src/errors.rs
+++ b/sbroad-core/src/errors.rs
@@ -28,6 +28,8 @@ pub enum Entity {
     DistributionKey,
     /// corresponds to enum Expression
     Expression,
+    /// corresponds to struct Histogram
+    Histogram,
     /// tarantool index
     Index,
     /// corresponds to metadata field of struct ProducerResult
@@ -68,6 +70,8 @@ pub enum Entity {
     SQLFunction,
     /// corresponds to struct Statement
     Statement,
+    /// corresponds to CBO statistics
+    Statistics,
     /// SQL sub-query
     SubQuery,
     /// sub-tree of the Plan
@@ -106,6 +110,7 @@ impl fmt::Display for Entity {
             Entity::Distribution => "distribution".to_string(),
             Entity::DistributionKey => "distribution key".to_string(),
             Entity::Expression => "expression".to_string(),
+            Entity::Histogram => "histogram".to_string(),
             Entity::Index => "index".to_string(),
             Entity::Metadata => "metadata".to_string(),
             Entity::Motion => "motion".to_string(),
@@ -126,6 +131,7 @@ impl fmt::Display for Entity {
             Entity::SpaceEngine => "space engine".to_string(),
             Entity::SQLFunction => "SQL function".to_string(),
             Entity::Statement => "statement".to_string(),
+            Entity::Statistics => "statistics".to_string(),
             Entity::SubQuery => "sub-query plan subtree".to_string(),
             Entity::SubTree => "execution plan subtree".to_string(),
             Entity::SyntaxNode => "syntax node".to_string(),
diff --git a/sbroad-core/src/lib.rs b/sbroad-core/src/lib.rs
index bdfb3600a8..f2cf398854 100644
--- a/sbroad-core/src/lib.rs
+++ b/sbroad-core/src/lib.rs
@@ -7,6 +7,7 @@ extern crate pest_derive;
 extern crate core;
 
 pub mod backend;
+pub mod cbo;
 pub mod errors;
 pub mod executor;
 pub mod frontend;
-- 
GitLab