From 34042075eb472989f4f1b74b8a74bdb28ed612c1 Mon Sep 17 00:00:00 2001 From: Igor Kuznetsov <kuznetsovin@gmail.com> Date: Wed, 2 Feb 2022 16:26:25 +0300 Subject: [PATCH] feat: determine subtree buckets for query with simple sharding key --- src/executor.rs | 1 + src/executor/shard.rs | 102 ++++++++++ src/executor/shard/tests.rs | 256 ++++++++++++++++++++++++ src/ir/distribution.rs | 23 +++ src/ir/expression.rs | 104 +++++++++- src/ir/transformation/redistribution.rs | 2 +- 6 files changed, 485 insertions(+), 3 deletions(-) create mode 100644 src/executor/shard.rs create mode 100644 src/executor/shard/tests.rs diff --git a/src/executor.rs b/src/executor.rs index d4cfe2f4a2..ee8a83e656 100644 --- a/src/executor.rs +++ b/src/executor.rs @@ -1 +1,2 @@ pub mod result; +pub mod shard; diff --git a/src/executor/shard.rs b/src/executor/shard.rs new file mode 100644 index 0000000000..c2492ad14c --- /dev/null +++ b/src/executor/shard.rs @@ -0,0 +1,102 @@ +use std::collections::HashSet; + +use traversal::DftPost; + +use crate::{ + bucket::str_to_bucket_id, + errors::QueryPlannerError, + ir::{expression::Expression, Plan}, +}; + +impl Plan { + /// Get`Bool` expression node where children are `Row` and `Const` types + /// + /// # Errors + /// - some of the expression nodes are invalid + #[allow(dead_code)] + fn get_eq_bool_nodes_with_row_const_children(&self, top_node_id: usize) -> Vec<usize> { + let mut nodes: Vec<usize> = Vec::new(); + + let post_tree = DftPost::new(&top_node_id, |node| self.nodes.subtree_iter(node)); + for (_, node_id) in post_tree { + if self.is_bool_node_simple_eq(*node_id) { + nodes.push(*node_id); + } + } + nodes + } + + /// Function evaluates buckets for the sending query. + /// + /// # Errors + /// - getting bucket id error + #[allow(dead_code)] + fn get_bucket_ids( + &mut self, + node_id: usize, + bucket_count: usize, + ) -> Result<Option<HashSet<usize>>, QueryPlannerError> { + let top_node = self.get_relation_node(node_id)?; + + // Determine query sharding keys + let distribute_query_row = self.get_expression_node(top_node.output())?; + if distribute_query_row.has_unknown_distribution()? { + // sharding keys wasn't determine and query will be sent to all instance + return Ok(None); + } + + let mut shard_keys = HashSet::new(); + let filter_nodes = self.get_eq_bool_nodes_with_row_const_children(node_id); + for filter_id in filter_nodes { + let node = self.get_expression_node(filter_id)?; + + if let Expression::Bool { left, right, .. } = node { + let mut row_id = *left; + let mut const_id = *right; + + // if left node isn't row we must swap node + if !self.get_expression_node(row_id)?.is_row() { + const_id = *right; + row_id = *left; + } + + // needs for temporary set empty distribution + // that must set after motion transformation (now doesn't work in this case) + if let Err(QueryPlannerError::UninitializedDistribution) = + self.get_distribution(row_id) + { + self.set_distribution(row_id)?; + } + + let sharding_info = self.get_distribution(row_id)?; + if !sharding_info.is_unknown() { + + // Queries with the complex sharding key aren't support yet (ignore them) + if sharding_info.get_segment_keys()?.len() > 1 { + return Ok(None); + } + + shard_keys.insert( + self.get_expression_node(const_id)? + .get_const_value()? + .to_string(), + ); + } + } + } + + if shard_keys.is_empty() { + return Ok(None); + } + let mut result = HashSet::new(); + for k in shard_keys { + let hash = str_to_bucket_id(&k, bucket_count); + result.insert(hash); + } + + Ok(Some(result)) + } +} + +#[cfg(test)] +mod tests; diff --git a/src/executor/shard/tests.rs b/src/executor/shard/tests.rs new file mode 100644 index 0000000000..0dcf91d3b3 --- /dev/null +++ b/src/executor/shard/tests.rs @@ -0,0 +1,256 @@ +use crate::cache::Metadata; +use crate::frontend::sql::ast::AbstractSyntaxTree; +use pretty_assertions::assert_eq; +use std::collections::HashSet; + +const CARTRIDGE_SCHEMA: &str = r#"spaces: + test_space: + engine: "memtx" + is_local: false + temporary: false + format: + - name: "id" + is_nullable: false + type: "number" + - name: "sysFrom" + is_nullable: false + type: "number" + - name: "FIRST_NAME" + is_nullable: false + type: "string" + - name: "sysOp" + is_nullable: false + type: "number" + - name: "bucket_id" + is_nullable: true + type: "unsigned" + indexes: + - type: "TREE" + name: "id" + unique: true + parts: + - path: "id" + type: "number" + is_nullable: false + - type: "TREE" + name: "bucket_id" + unique: false + parts: + - path: "bucket_id" + type: "unsigned" + is_nullable: true + sharding_key: + - id + test_space_hist: + engine: "memtx" + is_local: false + temporary: false + format: + - name: "id" + is_nullable: false + type: "number" + - name: "sysFrom" + is_nullable: false + type: "number" + - name: "FIRST_NAME" + is_nullable: false + type: "string" + - name: "sysOp" + is_nullable: false + type: "number" + - name: "bucket_id" + is_nullable: true + type: "unsigned" + indexes: + - type: "TREE" + name: "id" + unique: true + parts: + - path: "id" + type: "number" + is_nullable: false + - type: "TREE" + name: "bucket_id" + unique: false + parts: + - path: "bucket_id" + type: "unsigned" + is_nullable: true + sharding_key: + - id + hash_testing: + is_local: false + temporary: false + engine: "memtx" + format: + - name: "identification_number" + type: "integer" + is_nullable: false + - name: "product_code" + type: "string" + is_nullable: false + - name: "product_units" + type: "boolean" + is_nullable: false + - name: "sys_op" + type: "number" + is_nullable: false + - name: "bucket_id" + type: "unsigned" + is_nullable: true + indexes: + - name: "id" + unique: true + type: "TREE" + parts: + - path: "identification_number" + is_nullable: false + type: "integer" + - name: bucket_id + unique: false + parts: + - path: "bucket_id" + is_nullable: true + type: "unsigned" + type: "TREE" + sharding_key: + - identification_number + - product_code + hash_testing_hist: + is_local: false + temporary: false + engine: "memtx" + format: + - name: "identification_number" + type: "integer" + is_nullable: false + - name: "product_code" + type: "string" + is_nullable: false + - name: "product_units" + type: "boolean" + is_nullable: false + - name: "sys_op" + type: "number" + is_nullable: false + - name: "bucket_id" + type: "unsigned" + is_nullable: true + indexes: + - name: "id" + unique: true + type: "TREE" + parts: + - path: "identification_number" + is_nullable: false + type: "integer" + - name: bucket_id + unique: false + parts: + - path: "bucket_id" + is_nullable: true + type: "unsigned" + type: "TREE" + sharding_key: + - identification_number + - product_code"#; + +#[test] +fn simple_union_query() { + let query = r#"SELECT * FROM ( + SELECT * FROM "test_space" WHERE "sysFrom" > 0 + UNION ALL + SELECT * FROM "test_space_hist" WHERE "sysFrom" < 0 + ) as "t3" + WHERE "id" = 1"#; + + let mut metadata = Metadata::new(); + metadata.load(CARTRIDGE_SCHEMA).unwrap(); + + let ast = AbstractSyntaxTree::new(query).unwrap(); + // let plan = ast.to_ir(&metadata).unwrap(); + let mut plan = ast.to_ir(&metadata).unwrap(); + plan.add_motions().unwrap(); + + let top = plan.get_top().unwrap(); + let expected = HashSet::from([3940]); + assert_eq!(Some(expected), plan.get_bucket_ids(top, 30000).unwrap()) +} + +#[test] +fn simple_disjunction_in_union_query() { + let query = r#"SELECT * FROM ( + SELECT * FROM "test_space" WHERE "sysFrom" > 0 + UNION ALL + SELECT * FROM "test_space_hist" WHERE "sysFrom" < 0 + ) as "t3" + WHERE ("id" = 1) OR ("id" = 100)"#; + + let mut metadata = Metadata::new(); + metadata.load(CARTRIDGE_SCHEMA).unwrap(); + + let ast = AbstractSyntaxTree::new(query).unwrap(); + let mut plan = ast.to_ir(&metadata).unwrap(); + plan.add_motions().unwrap(); + + let top = plan.get_top().unwrap(); + let expected = HashSet::from([3940, 18512]); + + assert_eq!(Some(expected), plan.get_bucket_ids(top, 30000).unwrap()) +} + +#[test] +fn complex_shard_key_union_query() { + let query = r#"SELECT * +FROM + (SELECT "identification_number", "product_code" + FROM "hash_testing" + WHERE "sys_op" = 1 + UNION ALL + SELECT "identification_number", "product_code" + FROM "hash_testing_hist" + WHERE "sys_op" > 1) AS "t3" +WHERE "identification_number" = 1 AND "product_code" = '222'"#; + + let mut metadata = Metadata::new(); + metadata.load(CARTRIDGE_SCHEMA).unwrap(); + + let ast = AbstractSyntaxTree::new(query).unwrap(); + let mut plan = ast.to_ir(&metadata).unwrap(); + plan.add_motions().unwrap(); + + let top = plan.get_top().unwrap(); + // let expected = vec![2927]; + + assert_eq!(None, plan.get_bucket_ids(top, 30000).unwrap()) +} + +#[test] +fn union_complex_cond_query() { + let query = r#"SELECT * +FROM + (SELECT "identification_number", "product_code" + FROM "hash_testing" + WHERE "sys_op" = 1 + UNION ALL + SELECT "identification_number", "product_code" + FROM "hash_testing_hist" + WHERE "sys_op" > 1) AS "t3" +WHERE ("identification_number" = 1 + OR ("identification_number" = 100 + OR "identification_number" = 1000)) + AND ("product_code" = '222' + OR "product_code" = '111')"#; + + let mut metadata = Metadata::new(); + metadata.load(CARTRIDGE_SCHEMA).unwrap(); + + let ast = AbstractSyntaxTree::new(query).unwrap(); + let mut plan = ast.to_ir(&metadata).unwrap(); + plan.add_motions().unwrap(); + + let top = plan.get_top().unwrap(); + // let expected = vec![2927, 22116, 6673, 4203, 23260, 6558]; + + assert_eq!(None, plan.get_bucket_ids(top, 30000).unwrap()) +} diff --git a/src/ir/distribution.rs b/src/ir/distribution.rs index a5b59b15c6..23dc37c568 100644 --- a/src/ir/distribution.rs +++ b/src/ir/distribution.rs @@ -96,6 +96,29 @@ impl Distribution { }, } } + + #[must_use] + pub fn is_unknown(&self) -> bool { + if let Distribution::Any = self { + return true; + } + + false + } + + /// Get sharding key index + /// + /// # Errors + /// Returns `QueryPlannerError` when distribution isn't set. + pub fn get_segment_keys(&self) -> Result<HashSet<Key>, QueryPlannerError> { + if let Distribution::Segment { keys } = self.clone() { + return Ok(keys); + } + + Err(QueryPlannerError::CustomError( + "Distribution isn't segment".into(), + )) + } } impl Plan { diff --git a/src/ir/expression.rs b/src/ir/expression.rs index 1d2bd6563b..c132729840 100644 --- a/src/ir/expression.rs +++ b/src/ir/expression.rs @@ -6,10 +6,10 @@ //! - distribution of the data in the tuple use super::distribution::Distribution; -use super::operator; use super::value::Value; -use super::{Node, Nodes, Plan}; +use super::{operator, Node, Nodes, Plan}; use crate::errors::QueryPlannerError; +use crate::ir::operator::Bool; use serde::{Deserialize, Serialize}; use std::collections::HashSet; use traversal::DftPost; @@ -128,6 +128,47 @@ impl Expression { )), } } + + /// Checking determine distribution + /// + /// # Errors + /// - distribution isn't set + pub fn has_unknown_distribution(&self) -> Result<bool, QueryPlannerError> { + let d = self.distribution()?; + Ok(d.is_unknown()) + } + + /// Get value from const node + /// + /// # Errors + /// - node isn't constant type + pub fn get_const_value(&self) -> Result<Value, QueryPlannerError> { + if let Expression::Constant { value } = self.clone() { + return Ok(value); + } + + Err(QueryPlannerError::CustomError( + "Node isn't const type".into(), + )) + } + + /// The node is `Row` type checking + /// + /// # Errors + /// - node isn't `Row` type + #[must_use] + pub fn is_row(&self) -> bool { + matches!(self, Expression::Row { .. }) + } + + /// The node is `Const` type checking + /// + /// # Errors + /// - node isn't `Const` type + #[must_use] + pub fn is_const(&self) -> bool { + matches!(self, Expression::Constant { .. }) + } } impl Nodes { @@ -550,6 +591,65 @@ impl Plan { } Err(QueryPlannerError::InvalidRow) } + + /// Validate that is `Bool` node with `Row` and `Const` children. + #[must_use] + pub fn is_bool_node_simple_eq(&self, node_id: usize) -> bool { + let node = match self.get_expression_node(node_id) { + Ok(e) => e, + Err(_) => return false, + }; + if let Expression::Bool { left, op, right } = node { + if *op != Bool::Eq { + return false; + } + + let left_node = match self.get_expression_node(*left) { + Ok(e) => e, + Err(_) => return false, + }; + + let right_node = match self.get_expression_node(*right) { + Ok(e) => e, + Err(_) => return false, + }; + + let left_is_valid = left_node.is_row() || left_node.is_const(); + let right_is_valid = right_node.is_row() || right_node.is_const(); + if left_is_valid && right_is_valid { + return true; + } + } + + false + } + + /// Extract `Const` value from `Row` by index + /// + /// # Errors + /// - node is not row + /// - row doesn't have const + /// - const value is invalid + #[allow(dead_code)] + pub fn get_child_const_from_row( + &self, + row_id: usize, + child_num: usize, + ) -> Result<Value, QueryPlannerError> { + let node = self.get_expression_node(row_id)?; + if let Expression::Row { list, .. } = node { + let const_node_id = list.get(child_num).ok_or_else(|| { + QueryPlannerError::CustomError(format!("Child {} wasn't found", child_num)) + })?; + + let v = self + .get_expression_node(*const_node_id)? + .get_const_value()?; + + return Ok(v); + } + Err(QueryPlannerError::InvalidRow) + } } #[cfg(test)] diff --git a/src/ir/transformation/redistribution.rs b/src/ir/transformation/redistribution.rs index 0748c98fd1..029e415a3c 100644 --- a/src/ir/transformation/redistribution.rs +++ b/src/ir/transformation/redistribution.rs @@ -63,7 +63,7 @@ impl Plan { /// /// # Errors /// - some of the expression nodes are invalid - fn get_bool_nodes_with_row_children( + pub(crate) fn get_bool_nodes_with_row_children( &self, top: usize, ) -> Result<Vec<usize>, QueryPlannerError> { -- GitLab