From afd30b95816c7d3e631ae3d28d9b3ffc89689624 Mon Sep 17 00:00:00 2001
From: Vladimir Davydov <vdavydov.dev@gmail.com>
Date: Tue, 12 Mar 2019 19:45:13 +0300
Subject: [PATCH] bloom: do not use tuple_common_key_parts when constructing
 tuple bloom

Tuple bloom filter is an array of bloom filters, each of which reflects
lookups by all possible partial keys. To optimize the overall bloom
filter size, we need to know how many unique elements there are for each
partial key. To achieve that, we require the caller to pass the number
of key parts that have been hashed for the given tuple. Here's how it
looks in Vinyl:

        uint32_t hashed_parts = writer->last_stmt == NULL ? 0 :
                tuple_common_key_parts(stmt, writer->last_stmt,
                                       writer->key_def);
        tuple_bloom_builder_add(writer->bloom, stmt,
                                writer->key_def, hashed_parts);

Actually, there's no need in such a requirement as instead we can
calculate the hash value for the given tuple, compare it with the hash
of the tuple added last time, and add the new hash only if the two
values differ. This should be accurate enough while allowing us to get
rid of the cumbersome tuple_common_key_parts helper. Note, such a check
will only work if tuples are added in the order defined by the key
definition, but that already holds - anyway, one wouldn't be able to
use tuple_common_key_parts either if it wasn't true.

While we are at it, refresh the obsolete comment to tuple_bloom_builder.
---
 src/box/key_def.h        | 11 -----------
 src/box/tuple_bloom.c    | 15 ++++++++-------
 src/box/tuple_bloom.h    | 29 ++++++++++++++++++++++-------
 src/box/tuple_compare.cc | 33 ---------------------------------
 src/box/vy_run.c         | 10 ++--------
 5 files changed, 32 insertions(+), 66 deletions(-)

diff --git a/src/box/key_def.h b/src/box/key_def.h
index dd62f6a359..e3a34ce1ce 100644
--- a/src/box/key_def.h
+++ b/src/box/key_def.h
@@ -511,17 +511,6 @@ tuple_extract_key_raw(const char *data, const char *data_end,
 					      key_size);
 }
 
-/**
- * Return the length of the longest common prefix of two tuples.
- * @param tuple_a first tuple
- * @param tuple_b second tuple
- * @param key_def key defintion
- * @return number of key parts the two tuples have in common
- */
-uint32_t
-tuple_common_key_parts(const struct tuple *tuple_a, const struct tuple *tuple_b,
-		       struct key_def *key_def);
-
 /**
  * Compare keys using the key definition.
  * @param key_a key parts with MessagePack array header
diff --git a/src/box/tuple_bloom.c b/src/box/tuple_bloom.c
index cf887c7bf9..39d218dc02 100644
--- a/src/box/tuple_bloom.c
+++ b/src/box/tuple_bloom.c
@@ -72,8 +72,7 @@ tuple_bloom_builder_delete(struct tuple_bloom_builder *builder)
 
 int
 tuple_bloom_builder_add(struct tuple_bloom_builder *builder,
-			const struct tuple *tuple, struct key_def *key_def,
-			uint32_t hashed_parts)
+			const struct tuple *tuple, struct key_def *key_def)
 {
 	assert(builder->part_count == key_def->part_count);
 
@@ -82,17 +81,20 @@ tuple_bloom_builder_add(struct tuple_bloom_builder *builder,
 	uint32_t total_size = 0;
 
 	for (uint32_t i = 0; i < key_def->part_count; i++) {
+		struct tuple_hash_array *hash_arr = &builder->parts[i];
 		total_size += tuple_hash_key_part(&h, &carry, tuple,
 						  &key_def->parts[i]);
-		if (i < hashed_parts) {
+		uint32_t hash = PMurHash32_Result(h, carry, total_size);
+		if (hash_arr->count > 0 &&
+		    hash_arr->values[hash_arr->count - 1] == hash) {
 			/*
 			 * This part is already in the bloom, proceed
-			 * to the next one. Note, we can't skip to
-			 * hashed_parts, as we need to compute the hash.
+			 * to the next one. Note, this check only works
+			 * if tuples are added in the order defined by
+			 * the key definition.
 			 */
 			continue;
 		}
-		struct tuple_hash_array *hash_arr = &builder->parts[i];
 		if (hash_arr->count >= hash_arr->capacity) {
 			uint32_t capacity = MAX(hash_arr->capacity * 2, 1024U);
 			uint32_t *values = realloc(hash_arr->values,
@@ -105,7 +107,6 @@ tuple_bloom_builder_add(struct tuple_bloom_builder *builder,
 			hash_arr->capacity = capacity;
 			hash_arr->values = values;
 		}
-		uint32_t hash = PMurHash32_Result(h, carry, total_size);
 		hash_arr->values[hash_arr->count++] = hash;
 	}
 	return 0;
diff --git a/src/box/tuple_bloom.h b/src/box/tuple_bloom.h
index b05fee184c..b33344fc8f 100644
--- a/src/box/tuple_bloom.h
+++ b/src/box/tuple_bloom.h
@@ -80,9 +80,27 @@ struct tuple_hash_array {
  * Tuple bloom filter builder.
  *
  * Construction of a bloom filter proceeds as follows.
- * First all tuple hashes are stored in a builder object.
- * Once all hashes have been stored, a bloom filter of
- * the optimal size and all the hashes are added to it.
+ *
+ * First, tuples of the target set are added to a builder object.
+ * For further calculations to be correct, tuples MUST be added
+ * in the order defined by the provided key definition. The builder
+ * object stores hashes of all added tuples for each partial key,
+ * e.g. for tuple (1, 2, 3) it stores hashes of (1), (1, 2), and
+ * (1, 2, 3) in separate arrays. It doesn't store the same hash
+ * multiple times in the same array (that's what the order is
+ * required for), thus it knows how many unique encounters of each
+ * partial key are there.
+ *
+ * Once all tuples have been hashed, the builder can be used to
+ * create a bloom filter having the given false positive rate
+ * for all lookups, both by full and by partial key. Since when
+ * checking a tuple against a bloom filter, we check not only the
+ * full key bloom, but also all partial key blooms, the actual
+ * FPR of checking keys consisting of i parts will be equal to
+ * the multiplication of FPRs of individual bloom filters storing
+ * hashes of parts <= i. This allows us to use smaller FPR for
+ * partial bloom filters and hence reduce the bloom filter size.
+ * For more details, see tuple_bloom_new() implementation.
  */
 struct tuple_bloom_builder {
 	/** Number of key parts. */
@@ -111,14 +129,11 @@ tuple_bloom_builder_delete(struct tuple_bloom_builder *builder);
  * @param builder - bloom filter builder
  * @param tuple - tuple to add
  * @param key_def - key definition
- * @param hashed_parts - number of key parts that have already
- *  been added to the builder
  * @return 0 on success, -1 on OOM
  */
 int
 tuple_bloom_builder_add(struct tuple_bloom_builder *builder,
-			const struct tuple *tuple, struct key_def *key_def,
-			uint32_t hashed_parts);
+			const struct tuple *tuple, struct key_def *key_def);
 
 /**
  * Create a new tuple bloom filter.
diff --git a/src/box/tuple_compare.cc b/src/box/tuple_compare.cc
index cf4519ccbf..dfe30b190e 100644
--- a/src/box/tuple_compare.cc
+++ b/src/box/tuple_compare.cc
@@ -425,39 +425,6 @@ tuple_compare_field_with_hint(const char *field_a, enum mp_type a_type,
 	}
 }
 
-uint32_t
-tuple_common_key_parts(const struct tuple *tuple_a, const struct tuple *tuple_b,
-		       struct key_def *key_def)
-{
-	uint32_t i;
-	struct tuple_format *tuple_a_format = tuple_format(tuple_a);
-	struct tuple_format *tuple_b_format = tuple_format(tuple_b);
-	const char *tuple_a_raw = tuple_data(tuple_a);
-	const char *tuple_b_raw = tuple_data(tuple_b);
-	const uint32_t *tuple_a_field_map = tuple_field_map(tuple_a);
-	const uint32_t *tuple_b_field_map = tuple_field_map(tuple_b);
-	for (i = 0; i < key_def->part_count; i++) {
-		struct key_part *part = (struct key_part *)&key_def->parts[i];
-		const char *field_a =
-			tuple_field_raw_by_part(tuple_a_format, tuple_a_raw,
-						tuple_a_field_map, part);
-		const char *field_b =
-			tuple_field_raw_by_part(tuple_b_format, tuple_b_raw,
-						tuple_b_field_map, part);
-		enum mp_type a_type = field_a != NULL ?
-				      mp_typeof(*field_a) : MP_NIL;
-		enum mp_type b_type = field_b != NULL ?
-				      mp_typeof(*field_b) : MP_NIL;
-		if (a_type == MP_NIL && b_type == MP_NIL)
-			continue;
-		if (a_type == MP_NIL || b_type == MP_NIL ||
-		    tuple_compare_field_with_hint(field_a, a_type,
-				field_b, b_type, part->type, part->coll) != 0)
-			break;
-	}
-	return i;
-}
-
 template<bool is_nullable, bool has_optional_parts, bool has_json_paths>
 static inline int
 tuple_compare_slowpath(const struct tuple *tuple_a, const struct tuple *tuple_b,
diff --git a/src/box/vy_run.c b/src/box/vy_run.c
index 3c096b0a82..e2123fe7f1 100644
--- a/src/box/vy_run.c
+++ b/src/box/vy_run.c
@@ -2178,11 +2178,8 @@ static int
 vy_run_writer_write_to_page(struct vy_run_writer *writer, struct tuple *stmt)
 {
 	if (writer->bloom != NULL) {
-		uint32_t hashed_parts = writer->last_stmt == NULL ? 0 :
-			tuple_common_key_parts(stmt, writer->last_stmt,
-					       writer->key_def);
 		tuple_bloom_builder_add(writer->bloom, stmt,
-					writer->key_def, hashed_parts);
+					writer->key_def);
 	}
 	if (writer->last_stmt != NULL)
 		vy_stmt_unref_if_possible(writer->last_stmt);
@@ -2409,11 +2406,8 @@ vy_run_rebuild_index(struct vy_run *run, const char *dir,
 			if (tuple == NULL)
 				goto close_err;
 			if (bloom_builder != NULL) {
-				uint32_t hashed_parts = prev_tuple == NULL ? 0 :
-					tuple_common_key_parts(prev_tuple,
-							       tuple, key_def);
 				tuple_bloom_builder_add(bloom_builder, tuple,
-							key_def, hashed_parts);
+							key_def);
 			}
 			key = tuple_extract_key(tuple, cmp_def, NULL);
 			if (prev_tuple != NULL)
-- 
GitLab