From 34042075eb472989f4f1b74b8a74bdb28ed612c1 Mon Sep 17 00:00:00 2001
From: Igor Kuznetsov <kuznetsovin@gmail.com>
Date: Wed, 2 Feb 2022 16:26:25 +0300
Subject: [PATCH] feat: determine subtree buckets for query with simple
 sharding key

---
 src/executor.rs                         |   1 +
 src/executor/shard.rs                   | 102 ++++++++++
 src/executor/shard/tests.rs             | 256 ++++++++++++++++++++++++
 src/ir/distribution.rs                  |  23 +++
 src/ir/expression.rs                    | 104 +++++++++-
 src/ir/transformation/redistribution.rs |   2 +-
 6 files changed, 485 insertions(+), 3 deletions(-)
 create mode 100644 src/executor/shard.rs
 create mode 100644 src/executor/shard/tests.rs

diff --git a/src/executor.rs b/src/executor.rs
index d4cfe2f4a2..ee8a83e656 100644
--- a/src/executor.rs
+++ b/src/executor.rs
@@ -1 +1,2 @@
 pub mod result;
+pub mod shard;
diff --git a/src/executor/shard.rs b/src/executor/shard.rs
new file mode 100644
index 0000000000..c2492ad14c
--- /dev/null
+++ b/src/executor/shard.rs
@@ -0,0 +1,102 @@
+use std::collections::HashSet;
+
+use traversal::DftPost;
+
+use crate::{
+    bucket::str_to_bucket_id,
+    errors::QueryPlannerError,
+    ir::{expression::Expression, Plan},
+};
+
+impl Plan {
+    /// Get`Bool` expression node where children are `Row` and `Const` types
+    ///
+    /// # Errors
+    /// - some of the expression nodes are invalid
+    #[allow(dead_code)]
+    fn get_eq_bool_nodes_with_row_const_children(&self, top_node_id: usize) -> Vec<usize> {
+        let mut nodes: Vec<usize> = Vec::new();
+
+        let post_tree = DftPost::new(&top_node_id, |node| self.nodes.subtree_iter(node));
+        for (_, node_id) in post_tree {
+            if self.is_bool_node_simple_eq(*node_id) {
+                nodes.push(*node_id);
+            }
+        }
+        nodes
+    }
+
+    /// Function evaluates buckets for the sending query.
+    ///
+    /// # Errors
+    /// - getting bucket id error
+    #[allow(dead_code)]
+    fn get_bucket_ids(
+        &mut self,
+        node_id: usize,
+        bucket_count: usize,
+    ) -> Result<Option<HashSet<usize>>, QueryPlannerError> {
+        let top_node = self.get_relation_node(node_id)?;
+
+        // Determine query sharding keys
+        let distribute_query_row = self.get_expression_node(top_node.output())?;
+        if distribute_query_row.has_unknown_distribution()? {
+            // sharding keys wasn't determine and query will be sent to all instance
+            return Ok(None);
+        }
+
+        let mut shard_keys = HashSet::new();
+        let filter_nodes = self.get_eq_bool_nodes_with_row_const_children(node_id);
+        for filter_id in filter_nodes {
+            let node = self.get_expression_node(filter_id)?;
+
+            if let Expression::Bool { left, right, .. } = node {
+                let mut row_id = *left;
+                let mut const_id = *right;
+
+                // if left node isn't row we must swap node
+                if !self.get_expression_node(row_id)?.is_row() {
+                    const_id = *right;
+                    row_id = *left;
+                }
+
+                // needs for temporary set empty distribution
+                // that must set after motion transformation (now doesn't work in this case)
+                if let Err(QueryPlannerError::UninitializedDistribution) =
+                    self.get_distribution(row_id)
+                {
+                    self.set_distribution(row_id)?;
+                }
+
+                let sharding_info = self.get_distribution(row_id)?;
+                if !sharding_info.is_unknown() {
+
+                    // Queries with the complex sharding key aren't support yet (ignore them)
+                    if sharding_info.get_segment_keys()?.len() > 1 {
+                        return Ok(None);
+                    }
+
+                    shard_keys.insert(
+                        self.get_expression_node(const_id)?
+                            .get_const_value()?
+                            .to_string(),
+                    );
+                }
+            }
+        }
+
+        if shard_keys.is_empty() {
+            return Ok(None);
+        }
+        let mut result = HashSet::new();
+        for k in shard_keys {
+            let hash = str_to_bucket_id(&k, bucket_count);
+            result.insert(hash);
+        }
+
+        Ok(Some(result))
+    }
+}
+
+#[cfg(test)]
+mod tests;
diff --git a/src/executor/shard/tests.rs b/src/executor/shard/tests.rs
new file mode 100644
index 0000000000..0dcf91d3b3
--- /dev/null
+++ b/src/executor/shard/tests.rs
@@ -0,0 +1,256 @@
+use crate::cache::Metadata;
+use crate::frontend::sql::ast::AbstractSyntaxTree;
+use pretty_assertions::assert_eq;
+use std::collections::HashSet;
+
+const CARTRIDGE_SCHEMA: &str = r#"spaces:
+  test_space:
+    engine: "memtx"
+    is_local: false
+    temporary: false
+    format:
+      - name: "id"
+        is_nullable: false
+        type: "number"
+      - name: "sysFrom"
+        is_nullable: false
+        type: "number"
+      - name: "FIRST_NAME"
+        is_nullable: false
+        type: "string"
+      - name: "sysOp"
+        is_nullable: false
+        type: "number"
+      - name: "bucket_id"
+        is_nullable: true
+        type: "unsigned"
+    indexes:
+      - type: "TREE"
+        name: "id"
+        unique: true
+        parts:
+          - path: "id"
+            type: "number"
+            is_nullable: false
+      - type: "TREE"
+        name: "bucket_id"
+        unique: false
+        parts:
+          - path: "bucket_id"
+            type: "unsigned"
+            is_nullable: true
+    sharding_key:
+      - id
+  test_space_hist:
+    engine: "memtx"
+    is_local: false
+    temporary: false
+    format:
+      - name: "id"
+        is_nullable: false
+        type: "number"
+      - name: "sysFrom"
+        is_nullable: false
+        type: "number"
+      - name: "FIRST_NAME"
+        is_nullable: false
+        type: "string"
+      - name: "sysOp"
+        is_nullable: false
+        type: "number"
+      - name: "bucket_id"
+        is_nullable: true
+        type: "unsigned"
+    indexes:
+      - type: "TREE"
+        name: "id"
+        unique: true
+        parts:
+          - path: "id"
+            type: "number"
+            is_nullable: false
+      - type: "TREE"
+        name: "bucket_id"
+        unique: false
+        parts:
+          - path: "bucket_id"
+            type: "unsigned"
+            is_nullable: true
+    sharding_key:
+      - id
+  hash_testing:
+    is_local: false
+    temporary: false
+    engine: "memtx"
+    format:
+      - name: "identification_number"
+        type: "integer"
+        is_nullable: false
+      - name: "product_code"
+        type: "string"
+        is_nullable: false
+      - name: "product_units"
+        type: "boolean"
+        is_nullable: false
+      - name: "sys_op"
+        type: "number"
+        is_nullable: false
+      - name: "bucket_id"
+        type: "unsigned"
+        is_nullable: true
+    indexes:
+      - name: "id"
+        unique: true
+        type: "TREE"
+        parts:
+          - path: "identification_number"
+            is_nullable: false
+            type: "integer"
+      - name: bucket_id
+        unique: false
+        parts:
+          - path: "bucket_id"
+            is_nullable: true
+            type: "unsigned"
+        type: "TREE"
+    sharding_key:
+      - identification_number
+      - product_code
+  hash_testing_hist:
+    is_local: false
+    temporary: false
+    engine: "memtx"
+    format:
+      - name: "identification_number"
+        type: "integer"
+        is_nullable: false
+      - name: "product_code"
+        type: "string"
+        is_nullable: false
+      - name: "product_units"
+        type: "boolean"
+        is_nullable: false
+      - name: "sys_op"
+        type: "number"
+        is_nullable: false
+      - name: "bucket_id"
+        type: "unsigned"
+        is_nullable: true
+    indexes:
+      - name: "id"
+        unique: true
+        type: "TREE"
+        parts:
+          - path: "identification_number"
+            is_nullable: false
+            type: "integer"
+      - name: bucket_id
+        unique: false
+        parts:
+          - path: "bucket_id"
+            is_nullable: true
+            type: "unsigned"
+        type: "TREE"
+    sharding_key:
+      - identification_number
+      - product_code"#;
+
+#[test]
+fn simple_union_query() {
+    let query = r#"SELECT * FROM (
+    SELECT * FROM "test_space" WHERE "sysFrom" > 0
+    UNION ALL
+    SELECT * FROM "test_space_hist" WHERE "sysFrom" < 0
+    ) as "t3"
+    WHERE "id" = 1"#;
+
+    let mut metadata = Metadata::new();
+    metadata.load(CARTRIDGE_SCHEMA).unwrap();
+
+    let ast = AbstractSyntaxTree::new(query).unwrap();
+    // let plan = ast.to_ir(&metadata).unwrap();
+    let mut plan = ast.to_ir(&metadata).unwrap();
+    plan.add_motions().unwrap();
+
+    let top = plan.get_top().unwrap();
+    let expected = HashSet::from([3940]);
+    assert_eq!(Some(expected), plan.get_bucket_ids(top, 30000).unwrap())
+}
+
+#[test]
+fn simple_disjunction_in_union_query() {
+    let query = r#"SELECT * FROM (
+    SELECT * FROM "test_space" WHERE "sysFrom" > 0
+    UNION ALL
+    SELECT * FROM "test_space_hist" WHERE "sysFrom" < 0
+    ) as "t3"
+    WHERE ("id" = 1) OR ("id" = 100)"#;
+
+    let mut metadata = Metadata::new();
+    metadata.load(CARTRIDGE_SCHEMA).unwrap();
+
+    let ast = AbstractSyntaxTree::new(query).unwrap();
+    let mut plan = ast.to_ir(&metadata).unwrap();
+    plan.add_motions().unwrap();
+
+    let top = plan.get_top().unwrap();
+    let expected = HashSet::from([3940, 18512]);
+
+    assert_eq!(Some(expected), plan.get_bucket_ids(top, 30000).unwrap())
+}
+
+#[test]
+fn complex_shard_key_union_query() {
+    let query = r#"SELECT *
+FROM
+    (SELECT "identification_number", "product_code"
+    FROM "hash_testing"
+    WHERE "sys_op" = 1
+    UNION ALL
+    SELECT "identification_number", "product_code"
+    FROM "hash_testing_hist"
+    WHERE "sys_op" > 1) AS "t3"
+WHERE "identification_number" = 1 AND "product_code" = '222'"#;
+
+    let mut metadata = Metadata::new();
+    metadata.load(CARTRIDGE_SCHEMA).unwrap();
+
+    let ast = AbstractSyntaxTree::new(query).unwrap();
+    let mut plan = ast.to_ir(&metadata).unwrap();
+    plan.add_motions().unwrap();
+
+    let top = plan.get_top().unwrap();
+    // let expected = vec![2927];
+
+    assert_eq!(None, plan.get_bucket_ids(top, 30000).unwrap())
+}
+
+#[test]
+fn union_complex_cond_query() {
+    let query = r#"SELECT *
+FROM
+    (SELECT "identification_number", "product_code"
+    FROM "hash_testing"
+    WHERE "sys_op" = 1
+    UNION ALL
+    SELECT "identification_number", "product_code"
+    FROM "hash_testing_hist"
+    WHERE "sys_op" > 1) AS "t3"
+WHERE ("identification_number" = 1
+    OR ("identification_number" = 100
+    OR "identification_number" = 1000))
+    AND ("product_code" = '222'
+    OR "product_code" = '111')"#;
+
+    let mut metadata = Metadata::new();
+    metadata.load(CARTRIDGE_SCHEMA).unwrap();
+
+    let ast = AbstractSyntaxTree::new(query).unwrap();
+    let mut plan = ast.to_ir(&metadata).unwrap();
+    plan.add_motions().unwrap();
+
+    let top = plan.get_top().unwrap();
+    // let expected = vec![2927, 22116, 6673, 4203, 23260, 6558];
+
+    assert_eq!(None, plan.get_bucket_ids(top, 30000).unwrap())
+}
diff --git a/src/ir/distribution.rs b/src/ir/distribution.rs
index a5b59b15c6..23dc37c568 100644
--- a/src/ir/distribution.rs
+++ b/src/ir/distribution.rs
@@ -96,6 +96,29 @@ impl Distribution {
             },
         }
     }
+
+    #[must_use]
+    pub fn is_unknown(&self) -> bool {
+        if let Distribution::Any = self {
+            return true;
+        }
+
+        false
+    }
+
+    /// Get sharding key index
+    ///
+    /// # Errors
+    /// Returns `QueryPlannerError` when distribution isn't set.
+    pub fn get_segment_keys(&self) -> Result<HashSet<Key>, QueryPlannerError> {
+        if let Distribution::Segment { keys } = self.clone() {
+            return Ok(keys);
+        }
+
+        Err(QueryPlannerError::CustomError(
+            "Distribution isn't segment".into(),
+        ))
+    }
 }
 
 impl Plan {
diff --git a/src/ir/expression.rs b/src/ir/expression.rs
index 1d2bd6563b..c132729840 100644
--- a/src/ir/expression.rs
+++ b/src/ir/expression.rs
@@ -6,10 +6,10 @@
 //! - distribution of the data in the tuple
 
 use super::distribution::Distribution;
-use super::operator;
 use super::value::Value;
-use super::{Node, Nodes, Plan};
+use super::{operator, Node, Nodes, Plan};
 use crate::errors::QueryPlannerError;
+use crate::ir::operator::Bool;
 use serde::{Deserialize, Serialize};
 use std::collections::HashSet;
 use traversal::DftPost;
@@ -128,6 +128,47 @@ impl Expression {
             )),
         }
     }
+
+    /// Checking determine distribution
+    ///
+    /// # Errors
+    /// - distribution isn't set
+    pub fn has_unknown_distribution(&self) -> Result<bool, QueryPlannerError> {
+        let d = self.distribution()?;
+        Ok(d.is_unknown())
+    }
+
+    /// Get value from const node
+    ///
+    /// # Errors
+    /// - node isn't constant type
+    pub fn get_const_value(&self) -> Result<Value, QueryPlannerError> {
+        if let Expression::Constant { value } = self.clone() {
+            return Ok(value);
+        }
+
+        Err(QueryPlannerError::CustomError(
+            "Node isn't const type".into(),
+        ))
+    }
+
+    /// The node is `Row` type checking
+    ///
+    /// # Errors
+    /// - node isn't `Row` type
+    #[must_use]
+    pub fn is_row(&self) -> bool {
+        matches!(self, Expression::Row { .. })
+    }
+
+    /// The node is `Const` type checking
+    ///
+    /// # Errors
+    /// - node isn't `Const` type
+    #[must_use]
+    pub fn is_const(&self) -> bool {
+        matches!(self, Expression::Constant { .. })
+    }
 }
 
 impl Nodes {
@@ -550,6 +591,65 @@ impl Plan {
         }
         Err(QueryPlannerError::InvalidRow)
     }
+
+    /// Validate that is `Bool` node with `Row` and `Const` children.
+    #[must_use]
+    pub fn is_bool_node_simple_eq(&self, node_id: usize) -> bool {
+        let node = match self.get_expression_node(node_id) {
+            Ok(e) => e,
+            Err(_) => return false,
+        };
+        if let Expression::Bool { left, op, right } = node {
+            if *op != Bool::Eq {
+                return false;
+            }
+
+            let left_node = match self.get_expression_node(*left) {
+                Ok(e) => e,
+                Err(_) => return false,
+            };
+
+            let right_node = match self.get_expression_node(*right) {
+                Ok(e) => e,
+                Err(_) => return false,
+            };
+
+            let left_is_valid = left_node.is_row() || left_node.is_const();
+            let right_is_valid = right_node.is_row() || right_node.is_const();
+            if left_is_valid && right_is_valid {
+                return true;
+            }
+        }
+
+        false
+    }
+
+    /// Extract `Const` value from `Row` by index
+    ///
+    /// # Errors
+    /// - node is not row
+    /// - row doesn't have const
+    /// - const value is invalid
+    #[allow(dead_code)]
+    pub fn get_child_const_from_row(
+        &self,
+        row_id: usize,
+        child_num: usize,
+    ) -> Result<Value, QueryPlannerError> {
+        let node = self.get_expression_node(row_id)?;
+        if let Expression::Row { list, .. } = node {
+            let const_node_id = list.get(child_num).ok_or_else(|| {
+                QueryPlannerError::CustomError(format!("Child {} wasn't found", child_num))
+            })?;
+
+            let v = self
+                .get_expression_node(*const_node_id)?
+                .get_const_value()?;
+
+            return Ok(v);
+        }
+        Err(QueryPlannerError::InvalidRow)
+    }
 }
 
 #[cfg(test)]
diff --git a/src/ir/transformation/redistribution.rs b/src/ir/transformation/redistribution.rs
index 0748c98fd1..029e415a3c 100644
--- a/src/ir/transformation/redistribution.rs
+++ b/src/ir/transformation/redistribution.rs
@@ -63,7 +63,7 @@ impl Plan {
     ///
     /// # Errors
     /// - some of the expression nodes are invalid
-    fn get_bool_nodes_with_row_children(
+    pub(crate) fn get_bool_nodes_with_row_children(
         &self,
         top: usize,
     ) -> Result<Vec<usize>, QueryPlannerError> {
-- 
GitLab