diff --git a/changelogs/unreleased/gh-3389-cfg-option-memtx-sort-threads.md b/changelogs/unreleased/gh-3389-cfg-option-memtx-sort-threads.md
new file mode 100644
index 0000000000000000000000000000000000000000..9174ec4905238ba3bd37c3d7843a2c6089418dd3
--- /dev/null
+++ b/changelogs/unreleased/gh-3389-cfg-option-memtx-sort-threads.md
@@ -0,0 +1,5 @@
+## feature/box
+
+* Added the `box.cfg.memtx_sort_threads` parameter that specifies the number of
+  threads used to sort indexes keys on loading a memtx database. OpenMP is
+  not used to sort keys anymore (gh-3389).
diff --git a/cmake/BuildMisc.cmake b/cmake/BuildMisc.cmake
index 1a99f946bf876659c2de37b45566c9b8976ddeee..e6905204dec2099035703252ebf39cbdf3f97512 100644
--- a/cmake/BuildMisc.cmake
+++ b/cmake/BuildMisc.cmake
@@ -25,11 +25,6 @@ macro(libmisc_build)
         )
     endif()
 
-    if (HAVE_OPENMP)
-        list(APPEND misc_src
-             ${PROJECT_SOURCE_DIR}/third_party/qsort_arg_mt.c)
-    endif()
-
     add_library(misc STATIC ${misc_src})
     set_target_properties(misc PROPERTIES COMPILE_FLAGS "${DEPENDENCY_CFLAGS}")
 
diff --git a/cmake/compiler.cmake b/cmake/compiler.cmake
index c43ca41d25b469caf0a31cd17bb329f0bd1c29b7..14652a95a42027a2901237ddc7a8894de91f119e 100644
--- a/cmake/compiler.cmake
+++ b/cmake/compiler.cmake
@@ -67,18 +67,6 @@ if((NOT HAVE_STD_C11 AND NOT HAVE_STD_GNU99) OR
         "Please consider upgrade to gcc 4.5+ or clang 3.2+.")
 endif()
 
-#
-# Check for an omp support
-#
-set(CMAKE_REQUIRED_FLAGS "-fopenmp -Werror")
-check_cxx_source_compiles("int main(void) {
-#pragma omp parallel
-    {
-    }
-    return 0;
-}" HAVE_OPENMP)
-set(CMAKE_REQUIRED_FLAGS "")
-
 #
 # GCC started to warn for unused result starting from 4.2, and
 # this is when it introduced -Wno-unused-result
@@ -327,10 +315,6 @@ macro(enable_tnt_compile_flags)
     endif()
 endmacro(enable_tnt_compile_flags)
 
-if (HAVE_OPENMP)
-    add_compile_flags("C;CXX" "-fopenmp")
-endif()
-
 if (CMAKE_COMPILER_IS_CLANG OR CMAKE_COMPILER_IS_GNUCC)
     set(HAVE_BUILTIN_CTZ 1)
     set(HAVE_BUILTIN_CTZLL 1)
diff --git a/debian/copyright b/debian/copyright
index 93892bae56794721d05f5d900b219fc0c5e64a7a..646042b13e725127528228db59ebd27eb9427de2 100644
--- a/debian/copyright
+++ b/debian/copyright
@@ -181,7 +181,6 @@ Copyright: 2013, Red Hat, Inc
 License: GPL
 
 Files: third_party/qsort_arg.c
-    third_party/qsort_arg_mt.c
     third_party/compat/sys/bsd_time.h
     third_party/queue.h
     third_party/rb.h
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 84f1bf19808ceb5c70d9fe0d0be05c1cf4d7333e..77f89121e72622f5c3d9480aaab55aa86b64d748 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -349,13 +349,6 @@ include_directories(${EXTRA_BOX_INCLUDE_DIRS})
 set(TARANTOOL_C_FLAGS ${CMAKE_C_FLAGS} PARENT_SCOPE)
 set(TARANTOOL_CXX_FLAGS ${CMAKE_CXX_FLAGS} PARENT_SCOPE)
 
-if(BUILD_STATIC AND HAVE_OPENMP)
-    # Link libgomp explicitly to make it static. Avoid linking
-    # against DSO version of libgomp, which implied by -fopenmp
-    set (common_libraries ${common_libraries} "libgomp.a")
-    set(CMAKE_EXE_LINKER_FLAGS  "${CMAKE_EXE_LINKER_FLAGS} -fno-openmp")
-endif()
-
 set(exports_file_sources
     ${PROJECT_SOURCE_DIR}/extra/exports
     ${EXTRA_EXPORTS_FILE_SOURCES})
diff --git a/src/box/box.cc b/src/box/box.cc
index 84f1ba1e054ad5f9f57f0c3194d51cd026963776..b3c71ac1fdc887f3b0cf0f9c0658eac6e125ee22 100644
--- a/src/box/box.cc
+++ b/src/box/box.cc
@@ -97,6 +97,7 @@
 #include "small/static.h"
 #include "memory.h"
 #include "sqlLimit.h"
+#include "tt_sort.h"
 
 static char status[64] = "unconfigured";
 
@@ -1779,6 +1780,24 @@ box_init_say()
 	return 0;
 }
 
+/**
+ * Checks whether memtx_sort_threads configuration parameter is correct.
+ */
+static void
+box_check_memtx_sort_threads(void)
+{
+	int num = cfg_geti("memtx_sort_threads");
+	/*
+	 * After high level checks this parameter is either nil or has
+	 * type 'number'.
+	 */
+	if (cfg_isnumber("memtx_sort_threads") &&
+	    (num <= 0 || num > TT_SORT_THREADS_MAX))
+		tnt_raise(ClientError, ER_CFG, "memtx_sort_threads",
+			  tt_sprintf("must be greater than 0 and less than or"
+				     " equal to %d", TT_SORT_THREADS_MAX));
+}
+
 void
 box_check_config(void)
 {
@@ -1841,6 +1860,7 @@ box_check_config(void)
 		diag_raise();
 	if (box_check_txn_isolation() == txn_isolation_level_MAX)
 		diag_raise();
+	box_check_memtx_sort_threads();
 }
 
 int
@@ -4558,6 +4578,7 @@ engine_init()
 				    cfg_geti("slab_alloc_granularity"),
 				    cfg_gets("memtx_allocator"),
 				    cfg_getd("slab_alloc_factor"),
+				    cfg_geti("memtx_sort_threads"),
 				    box_on_indexes_built);
 	engine_register((struct engine *)memtx);
 	box_set_memtx_max_tuple_size();
diff --git a/src/box/lua/load_cfg.lua b/src/box/lua/load_cfg.lua
index 672031cf894ae88767f88a72e7c5ca062aca3a84..ca5aa9eae760b0dead9a58c384dd05a23235a862 100644
--- a/src/box/lua/load_cfg.lua
+++ b/src/box/lua/load_cfg.lua
@@ -201,6 +201,7 @@ local default_cfg = {
     sql_vdbe_max_steps    = 45000,
     txn_timeout           = 365 * 100 * 86400,
     txn_isolation         = "best-effort",
+    memtx_sort_threads    = nil,
 
     metrics     = {
         include = 'all',
@@ -393,6 +394,7 @@ local template_cfg = {
     sql_cache_size        = 'number',
     sql_vdbe_max_steps    = 'number',
     txn_timeout           = 'number',
+    memtx_sort_threads    = 'number',
 
     metrics = 'table',
 }
diff --git a/src/box/memtx_engine.cc b/src/box/memtx_engine.cc
index 8d9ded8789243d8a8b77761f1e7b94680df92a38..24926f5ca7e94504e91ecea4104c7cbf6a17cb85 100644
--- a/src/box/memtx_engine.cc
+++ b/src/box/memtx_engine.cc
@@ -59,6 +59,7 @@
 #include "memtx_space.h"
 #include "memtx_space_upgrade.h"
 #include "wal.h"
+#include "tt_sort.h"
 
 #include <type_traits>
 
@@ -1632,7 +1633,7 @@ struct memtx_engine *
 memtx_engine_new(const char *snap_dirname, bool force_recovery,
 		 uint64_t tuple_arena_max_size, uint32_t objsize_min,
 		 bool dontdump, unsigned granularity,
-		 const char *allocator, float alloc_factor,
+		 const char *allocator, float alloc_factor, int sort_threads,
 		 memtx_on_indexes_built_cb on_indexes_built)
 {
 	int64_t snap_signature;
@@ -1751,6 +1752,30 @@ memtx_engine_new(const char *snap_dirname, bool force_recovery,
 	memtx->state = MEMTX_INITIALIZED;
 	memtx->max_tuple_size = MAX_TUPLE_SIZE;
 	memtx->force_recovery = force_recovery;
+	if (sort_threads == 0) {
+		char *ompnum_str = getenv_safe("OMP_NUM_THREADS", NULL, 0);
+		if (ompnum_str != NULL) {
+			long ompnum = strtol(ompnum_str, NULL, 10);
+			if (ompnum > 0 && ompnum <= TT_SORT_THREADS_MAX) {
+				say_warn("OMP_NUM_THREADS is used to set number"
+					 " of sorting threads. Use cfg option"
+					 " 'memtx_sort_threads' instead.");
+				sort_threads = ompnum;
+			}
+			free(ompnum_str);
+		}
+		if (sort_threads == 0) {
+			sort_threads = sysconf(_SC_NPROCESSORS_ONLN);
+			if (sort_threads < 1) {
+				say_warn("Cannot get number of processors. "
+					 "Fallback to single processor.");
+				sort_threads = 1;
+			} else if (sort_threads > TT_SORT_THREADS_MAX) {
+				sort_threads = TT_SORT_THREADS_MAX;
+			}
+		}
+	}
+	memtx->sort_threads = sort_threads;
 
 	memtx->replica_join_cord = NULL;
 
diff --git a/src/box/memtx_engine.h b/src/box/memtx_engine.h
index 25ad39f533d25369161a04ba78a75b3080f9033e..00b633c4f88817dbc85f4ce62482ae0bf7e1ddc8 100644
--- a/src/box/memtx_engine.h
+++ b/src/box/memtx_engine.h
@@ -188,6 +188,11 @@ struct memtx_engine {
 	struct tuple_format *func_key_format;
 	/** Set of extents allocated using malloc. */
 	struct mh_ptr_t *malloc_extents;
+	/**
+	 * Number of threads used to sort keys of secondary indexes on engine
+	 * start.
+	 */
+	int sort_threads;
 };
 
 struct memtx_gc_task;
@@ -223,7 +228,7 @@ struct memtx_engine *
 memtx_engine_new(const char *snap_dirname, bool force_recovery,
 		 uint64_t tuple_arena_max_size, uint32_t objsize_min,
 		 bool dontdump, unsigned granularity,
-		 const char *allocator, float alloc_factor,
+		 const char *allocator, float alloc_factor, int threads_num,
 		 memtx_on_indexes_built_cb on_indexes_built);
 
 /**
@@ -364,13 +369,14 @@ memtx_engine_new_xc(const char *snap_dirname, bool force_recovery,
 		    uint64_t tuple_arena_max_size, uint32_t objsize_min,
 		    bool dontdump, unsigned granularity,
 		    const char *allocator, float alloc_factor,
+		    int sort_threads,
 		    memtx_on_indexes_built_cb on_indexes_built)
 {
 	struct memtx_engine *memtx;
 	memtx = memtx_engine_new(snap_dirname, force_recovery,
 				 tuple_arena_max_size, objsize_min, dontdump,
 				 granularity, allocator, alloc_factor,
-				 on_indexes_built);
+				 sort_threads, on_indexes_built);
 	if (memtx == NULL)
 		diag_raise();
 	return memtx;
diff --git a/src/box/memtx_tree.cc b/src/box/memtx_tree.cc
index dd15275a36e3df08f2468c624669be4f5505bbb1..a77c642f67b794911cae352e125e3da8a808bb4f 100644
--- a/src/box/memtx_tree.cc
+++ b/src/box/memtx_tree.cc
@@ -42,7 +42,7 @@
 #include "memtx_tx.h"
 #include "trivia/config.h"
 #include "trivia/util.h"
-#include <qsort_arg.h>
+#include "tt_sort.h"
 #include <small/mempool.h>
 
 /**
@@ -1850,9 +1850,10 @@ memtx_tree_index_end_build(struct index *base)
 	struct memtx_tree_index<USE_HINT> *index =
 		(struct memtx_tree_index<USE_HINT> *)base;
 	struct key_def *cmp_def = memtx_tree_cmp_def(&index->tree);
-	qsort_arg(index->build_array, index->build_array_size,
-		  sizeof(index->build_array[0]),
-		  memtx_tree_qcompare<USE_HINT>, cmp_def);
+	struct memtx_engine *memtx = (struct memtx_engine *)base->engine;
+	tt_sort(index->build_array, index->build_array_size,
+		sizeof(index->build_array[0]), memtx_tree_qcompare<USE_HINT>,
+		cmp_def, memtx->sort_threads);
 	if (cmp_def->is_multikey || cmp_def->for_func_index) {
 		/*
 		 * Multikey index may have equal(in terms of
diff --git a/src/lib/core/tt_sort.c b/src/lib/core/tt_sort.c
index 177daa0bfa84d6cac936eadb4643a2ab983fba25..474dc9442371e7a3c34dde16e45adb11e13c8574 100644
--- a/src/lib/core/tt_sort.c
+++ b/src/lib/core/tt_sort.c
@@ -99,7 +99,7 @@ static int
 find_bucket(struct sort_data *sort, void *elem)
 {
 	/*
-	 * Bucket count is `thread_count`, thus bucket boundraries (splitters)
+	 * Bucket count is `thread_count`, thus bucket boundaries (splitters)
 	 * count is `thread_count - 1` omitting most left and most right
 	 * boundaries. Let's place most left and most right boundaries at
 	 * imaginary indexes `-1` and `size of splitters` respectively.
@@ -175,9 +175,9 @@ sort_bucket(va_list ap)
 	struct sort_data *sort = worker->sort;
 
 	/* Sort this worker bucket. */
-	qsort_arg_st(sort->buffer + worker->bucket_begin * sort->elem_size,
-		     worker->bucket_size, sort->elem_size,
-		     sort->cmp, sort->cmp_arg);
+	qsort_arg(sort->buffer + worker->bucket_begin * sort->elem_size,
+		  worker->bucket_size, sort->elem_size,
+		  sort->cmp, sort->cmp_arg);
 
 	/* Move sorted data back from temporary space. */
 	memcpy(sort->data + worker->bucket_begin * sort->elem_size,
@@ -254,8 +254,8 @@ find_splitters(struct sort_data *sort)
 		       sort->data + i * sample_step * sort->elem_size,
 		       sort->elem_size);
 
-	qsort_arg_st(samples, samples_num, sort->elem_size, sort->cmp,
-		     sort->cmp_arg);
+	qsort_arg(samples, samples_num, sort->elem_size, sort->cmp,
+		  sort->cmp_arg);
 
 	/* Take splitters from samples. */
 	for (int i = 0; i < sort->thread_count - 1; i++) {
@@ -293,8 +293,8 @@ sort_all(va_list ap)
 {
 	struct sort_data *sort = va_arg(ap, typeof(sort));
 
-	qsort_arg_st(sort->data, sort->elem_count, sort->elem_size, sort->cmp,
-		     sort->cmp_arg);
+	qsort_arg(sort->data, sort->elem_count, sort->elem_size, sort->cmp,
+		  sort->cmp_arg);
 
 	return 0;
 }
@@ -353,7 +353,7 @@ tt_sort(void *data, size_t elem_count, size_t elem_size,
 	if (elem_count < NOSPAWN_SIZE_THESHOLD) {
 		say_verbose("data size is less than threshold %d,"
 			    " sort in caller thread", NOSPAWN_SIZE_THESHOLD);
-		qsort_arg_st(data, elem_count, elem_size, cmp, cmp_arg);
+		qsort_arg(data, elem_count, elem_size, cmp, cmp_arg);
 		return;
 	}
 
diff --git a/src/trivia/config.h.cmake b/src/trivia/config.h.cmake
index 0c7c285749bfc8085dfeecec6c187922d3f9b089..9dabd29b9bb1668218281c93f3ce85d3af11e87e 100644
--- a/src/trivia/config.h.cmake
+++ b/src/trivia/config.h.cmake
@@ -146,10 +146,6 @@
  * Defined if this is a big-endian system.
  */
 #cmakedefine HAVE_BYTE_ORDER_BIG_ENDIAN 1
-/*
- * Defined if this platform supports openmp and it is enabled
- */
-#cmakedefine HAVE_OPENMP 1
 /*
 *  Defined if compatible with GNU readline installed.
 */
diff --git a/test/box-luatest/gh_3389_cfg_option_memtx_sort_threads_test.lua b/test/box-luatest/gh_3389_cfg_option_memtx_sort_threads_test.lua
new file mode 100644
index 0000000000000000000000000000000000000000..4b9bb8c0826b35e72a52731606f3aef5857edc3f
--- /dev/null
+++ b/test/box-luatest/gh_3389_cfg_option_memtx_sort_threads_test.lua
@@ -0,0 +1,58 @@
+local server = require('luatest.server')
+local t = require('luatest')
+
+local g = t.group()
+
+g.before_all(function(cg)
+    -- test setting memtx_sort_threads to non default value
+    cg.server = server:new{}
+    cg.server:start()
+    cg.server:exec(function()
+        -- this will trigger calling tt_sort on box.cfg{}
+        local s = box.schema.space.create('test')
+        s:create_index('pri')
+        s:replace{2}
+        s:replace{1}
+        box.snapshot()
+    end)
+end)
+
+g.after_all(function(cg)
+    if cg.server ~= nil then
+        cg.server:drop()
+    end
+end)
+
+g.after_each(function(cg)
+    if cg.server ~= nil then
+        cg.server:stop()
+    end
+end)
+
+g.test_setting_cfg_option = function(cg)
+    cg.server = server:new{box_cfg = {memtx_sort_threads = 3}}
+    cg.server:start()
+    cg.server:exec(function()
+        t.assert_error_msg_equals(
+            "Can't set option 'memtx_sort_threads' dynamically",
+            box.cfg, {memtx_sort_threads = 5})
+    end)
+end
+
+g.test_setting_openmp_env_var = function(cg)
+    cg.server = server:new{box_cfg = {log_level = 'warn'},
+                           env = {OMP_NUM_THREADS = ' 3'}}
+    cg.server:start()
+    t.helpers.retrying({}, function()
+        local p = "OMP_NUM_THREADS is used to set number" ..
+                  " of sorting threads. Use cfg option" ..
+                  " 'memtx_sort_threads' instead."
+        t.assert_not_equals(cg.server:grep_log(p), nil)
+    end)
+    cg.server:stop()
+end
+
+g.test_setting_openmp_env_var_bad = function(cg)
+    cg.server = server:new{env= {OMP_NUM_THREADS = '300'}}
+    cg.server:start()
+end
diff --git a/test/box-tap/cfg.test.lua b/test/box-tap/cfg.test.lua
index 2d3e3a5222460dbd23f72c6e6284784da202b53e..ff42fa747ff8a079a8d8f744c3273c3c91b25fc7 100755
--- a/test/box-tap/cfg.test.lua
+++ b/test/box-tap/cfg.test.lua
@@ -6,7 +6,7 @@ local socket = require('socket')
 local fio = require('fio')
 local uuid = require('uuid')
 local msgpack = require('msgpack')
-test:plan(108)
+test:plan(112)
 
 --------------------------------------------------------------------------------
 -- Invalid values
@@ -50,6 +50,10 @@ invalid('vinyl_bloom_fpr', 0)
 invalid('vinyl_bloom_fpr', 1.1)
 invalid('wal_queue_max_size', -1)
 invalid('sql_vdbe_max_steps', -1)
+invalid('memtx_sort_threads', 'all')
+invalid('memtx_sort_threads', -1)
+invalid('memtx_sort_threads', 0)
+invalid('memtx_sort_threads', 257)
 
 local function invalid_combinations(name, val)
     local status, result = pcall(box.cfg, val)
diff --git a/third_party/qsort_arg.c b/third_party/qsort_arg.c
index 921e8c31e338f396bdae838bc2733c6669cd6c91..d27f7e51d34158cc7bdaa564f4f7c241af60662e 100644
--- a/third_party/qsort_arg.c
+++ b/third_party/qsort_arg.c
@@ -50,16 +50,6 @@
 #include <qsort_arg.h>
 #include <stdint.h>
 
-enum {
-	/*
-	 * The size of an array from which it is reasonable to start
-	 * multithread sort.
-	 * If the array size is less than the size below, a single-threaded
-	 * version of qsort will be started.S
-	 */
-	MULTITHREAD_SIZE_THRESHOLD = 128 * 1024,
-};
-
 #define min(a, b)   (a) < (b) ? a : b
 
 static char *med3(char *a, char *b, char *c,
@@ -119,7 +109,7 @@ med3(char *a, char *b, char *c, int (*cmp)(const void *a, const void *b, void *a
  * Single-thread version of qsort.
  */
 void
-qsort_arg_st(void *a, size_t n, size_t es, int (*cmp)(const void *a, const void *b, void *arg), void *arg)
+qsort_arg(void *a, size_t n, size_t es, int (*cmp)(const void *a, const void *b, void *arg), void *arg)
 {
 	char	   *pa,
 			   *pb,
@@ -208,7 +198,7 @@ loop:SWAPINIT(a, es);
 	{
 		/* Recurse on left partition, then iterate on right partition */
 		if (d1 > es)
-			qsort_arg_st(a, d1 / es, es, cmp, arg);
+			qsort_arg(a, d1 / es, es, cmp, arg);
 		if (d2 > es)
 		{
 			/* Iterate rather than recurse to save stack space */
@@ -222,7 +212,7 @@ loop:SWAPINIT(a, es);
 	{
 		/* Recurse on right partition, then iterate on left partition */
 		if (d2 > es)
-			qsort_arg_st(pn - d2, d2 / es, es, cmp, arg);
+			qsort_arg(pn - d2, d2 / es, es, cmp, arg);
 		if (d1 > es)
 		{
 			/* Iterate rather than recurse to save stack space */
@@ -232,30 +222,3 @@ loop:SWAPINIT(a, es);
 		}
 	}
 }
-
-#ifdef HAVE_OPENMP
-/**
- * Multi-thread version of qsort. Only present when target machine supports
- * open MP.
- */
-void qsort_arg_mt(void *a, size_t n, size_t es,
-		  int (*cmp)(const void *, const void *, void *), void *arg);
-#endif
-
-/**
- * General version of qsort that calls single-threaded of multi-threaded
- * qsort depending on open MP availability and given array size.
- */
-void
-qsort_arg(void *a, size_t n, size_t es,
-	  int (*cmp)(const void *a, const void *b, void *arg), void *arg)
-{
-#ifdef HAVE_OPENMP
-	if (n >= MULTITHREAD_SIZE_THRESHOLD)
-		qsort_arg_mt(a, n, es, cmp, arg);
-	else
-		qsort_arg_st(a, n, es, cmp, arg);
-#else
-	qsort_arg_st(a, n, es, cmp, arg);
-#endif
-}
diff --git a/third_party/qsort_arg.h b/third_party/qsort_arg.h
index 918d90ef1392b0298bed2a255484e1d74d0d5d0b..875a21029d8d3e67b8a3693cc20d6622abb2dc73 100644
--- a/third_party/qsort_arg.h
+++ b/third_party/qsort_arg.h
@@ -12,15 +12,8 @@ extern "C" {
  * Single-thread version of qsort.
  */
 void
-qsort_arg_st(void *a, size_t n, size_t es,
-	     int (*cmp)(const void *a, const void *b, void *arg), void *arg);
-
-/**
- * General version of qsort that calls single-threaded of multi-threaded
- * qsort depending on open MP availability and given array size.
- */
-void qsort_arg(void *a, size_t n, size_t es,
-	       int (*cmp)(const void *a, const void *b, void *arg), void *arg);
+qsort_arg(void *a, size_t n, size_t es,
+	  int (*cmp)(const void *a, const void *b, void *arg), void *arg);
 
 #if defined(__cplusplus)
 }
diff --git a/third_party/qsort_arg_mt.c b/third_party/qsort_arg_mt.c
deleted file mode 100644
index afb261cf06f429cd175ea8721b06c51edb5cea8d..0000000000000000000000000000000000000000
--- a/third_party/qsort_arg_mt.c
+++ /dev/null
@@ -1,261 +0,0 @@
-/*
- * Imported from PostgreSQL sources by Teodor Sigaev <teodor@sigaev.ru>, <sigaev@corp.mail.ru>
- */
-
-/*
- *	qsort_arg.c: qsort with a passthrough "void *" argument
- *
- *	Modifications from vanilla NetBSD source:
- *	  Add do ... while() macro fix
- *	  Remove __inline, _DIAGASSERTs, __P
- *	  Remove ill-considered "swap_cnt" switch to insertion sort,
- *	  in favor of a simple check for presorted input.
- *
- *	CAUTION: if you change this file, see also qsort.c
- *
- *	$PostgreSQL: pgsql/src/port/qsort_arg.c,v 1.4 2007/03/18 05:36:50 neilc Exp $
- */
-
-/*	$NetBSD: qsort.c,v 1.13 2003/08/07 16:43:42 agc Exp $	*/
-
-/*-
- * Copyright (c) 1992, 1993
- *	The Regents of the University of California.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- *	  notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *	  notice, this list of conditions and the following disclaimer in the
- *	  documentation and/or other materials provided with the distribution.
- * 3. Neither the name of the University nor the names of its contributors
- *	  may be used to endorse or promote products derived from this software
- *	  without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED.	IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- */
-
-#include <qsort_arg.h>
-#include <stdint.h>
-
-#if defined(__cplusplus)
-extern "C" {
-#endif /* defined(__cplusplus) */
-
-#ifndef HAVE_OPENMP
-#error "HAVE_OPENMP macro is not defined"
-#endif
-
-#define min(a, b)   (a) < (b) ? a : b
-
-static char *med3(char *a, char *b, char *c,
-	 int (*cmp)(const void *a, const void *b, void *arg), void *arg);
-static void swapfunc(char *, char *, size_t, int);
-
-/**
- * @brief Reduce the current number of threads in the thread pool to the
- * bare minimum. Doesn't prevent the pool from spawning new threads later
- * if demand mounts.
- */
-static void
-thread_pool_trim()
-{
-	/*
-	 * Trim OpenMP thread pool.
-	 * Though we lack the direct control the workaround below works for
-	 * GNU OpenMP library. The library stops surplus threads on entering
-	 * a parallel region. Can't go below 2 threads due to the
-	 * implementation quirk.
-	 */
-#pragma omp parallel num_threads(2)
-	;
-}
-
-
-/*
- * Qsort routine based on J. L. Bentley and M. D. McIlroy,
- * "Engineering a sort function",
- * Software--Practice and Experience 23 (1993) 1249-1265.
- * We have modified their original by adding a check for already-sorted input,
- * which seems to be a win per discussions on pgsql-hackers around 2006-03-21.
- */
-#define swapcode(TYPE, parmi, parmj, n) \
-do {		\
-	size_t i = (n) / sizeof (TYPE);			\
-	TYPE *pi = (TYPE *)(void *)(parmi);			\
-	TYPE *pj = (TYPE *)(void *)(parmj);			\
-	do {						\
-		TYPE	t = *pi;			\
-		*pi++ = *pj;				\
-		*pj++ = t;				\
-		} while (--i > 0);				\
-} while (0)
-
-#define SWAPINIT(a, es) swaptype = ((char *)(a) - (char *)0) % sizeof(long) || \
-	(es) % sizeof(long) ? 2 : (es) == sizeof(long)? 0 : 1;
-
-static void
-swapfunc(char *a, char *b, size_t n, int swaptype)
-{
-	if (swaptype <= 1)
-		swapcode(long, a, b, n);
-	else
-		swapcode(char, a, b, n);
-}
-
-#define swap(a, b)						\
-	if (swaptype == 0) {					\
-		long t = *(long *)(void *)(a);			\
-		*(long *)(void *)(a) = *(long *)(void *)(b);	\
-		*(long *)(void *)(b) = t;			\
-	} else							\
-		swapfunc(a, b, es, swaptype)
-
-#define vecswap(a, b, n) if ((n) > 0) swapfunc((a), (b), (size_t)(n), swaptype)
-
-static char *
-med3(char *a, char *b, char *c, int (*cmp)(const void *a, const void *b, void *arg), void *arg)
-{
-	return cmp(a, b, arg) < 0 ?
-		(cmp(b, c, arg) < 0 ? b : (cmp(a, c, arg) < 0 ? c : a))
-		: (cmp(b, c, arg) > 0 ? b : (cmp(a, c, arg) < 0 ? a : c));
-}
-
-static void
-qsort_arg_mt_internal(void *a, size_t n, intptr_t es,
-	     int (*cmp)(const void *a, const void *b, void *arg), void *arg)
-{
-	char *pa, *pb, *pc, *pd, *pl, *pm, *pn;
-	size_t d1, d2;
-	intptr_t r, swaptype, presorted;
-
-	loop:SWAPINIT(a, es);
-	if (n < 7)
-	{
-		for (pm = (char *) a + es; pm < (char *) a + n * es; pm += es)
-			for (pl = pm; pl > (char *) a && cmp(pl - es, pl, arg) > 0;
-				 pl -= es)
-				swap(pl, pl - es);
-		return;
-	}
-	presorted = 1;
-	for (pm = (char *) a + es; pm < (char *) a + n * es; pm += es)
-	{
-		if (cmp(pm - es, pm, arg) > 0)
-		{
-			presorted = 0;
-			break;
-		}
-	}
-	if (presorted)
-		return;
-	pm = (char *) a + (n / 2) * es;
-	if (n > 7)
-	{
-		pl = (char *) a;
-		pn = (char *) a + (n - 1) * es;
-		if (n > 40)
-		{
-			size_t d = (n / 8) * es;
-			pl = med3(pl, pl + d, pl + 2 * d, cmp, arg);
-			pm = med3(pm - d, pm, pm + d, cmp, arg);
-			pn = med3(pn - 2 * d, pn - d, pn, cmp, arg);
-		}
-		pm = med3(pl, pm, pn, cmp, arg);
-	}
-	swap((char*)a, pm);
-	pa = pb = (char *) a + es;
-	pc = pd = (char *) a + (n - 1) * es;
-	for (;;)
-	{
-		while (pb <= pc && (r = cmp(pb, a, arg)) <= 0)
-		{
-			if (r == 0)
-			{
-				swap(pa, pb);
-				pa += es;
-			}
-			pb += es;
-		}
-		while (pb <= pc && (r = cmp(pc, a, arg)) >= 0)
-		{
-			if (r == 0)
-			{
-				swap(pc, pd);
-				pd -= es;
-			}
-			pc -= es;
-		}
-		if (pb > pc)
-			break;
-		swap(pb, pc);
-		pb += es;
-		pc -= es;
-	}
-	pn = (char *) a + n * es;
-	d1 = min(pa - (char *) a, pb - pa);
-	vecswap(a, pb - d1, d1);
-	d1 = min(pd - pc, pn - pd - es);
-	vecswap(pb, pn - d1, d1);
-	d1 = pb - pa;
-	d2 = pd - pc;
-	if (d1 <= d2)
-	{
-		/* Recurse on left partition, then iterate on right partition */
-		if (d1 > es) {
-#pragma omp task
-			qsort_arg_mt_internal(a, d1 / es, es, cmp, arg);
-		}
-		if (d2 > es)
-		{
-			/* Iterate rather than recurse to save stack space */
-			/* qsort(pn - d2, d2 / es, es, cmp); */
-			a = pn - d2;
-			n = d2 / es;
-			goto loop;
-		}
-	}
-	else
-	{
-		/* Recurse on right partition, then iterate on left partition */
-		if (d2 > es) {
-#pragma omp task
-			qsort_arg_mt_internal(pn - d2, d2 / es, es, cmp, arg);
-		}
-		if (d1 > es)
-		{
-			/* Iterate rather than recurse to save stack space */
-			/* qsort(a, d1 / es, es, cmp); */
-			n = d1 / es;
-			goto loop;
-		}
-	}
-}
-
-void
-qsort_arg_mt(void *a, size_t n, size_t es,
-	     int (*cmp)(const void *a, const void *b, void *arg), void *arg)
-{
-#pragma omp parallel
-	{
-#pragma omp single
-		qsort_arg_mt_internal(a, n, es, cmp, arg);
-	}
-	thread_pool_trim();
-}
-
-#if defined(__cplusplus)
-}
-#endif /* defined(__cplusplus) */