From e8f7aa00ee94935714bf545e1ea9b49fc8a2de15 Mon Sep 17 00:00:00 2001
From: pcherenkov <pcherenkov@gmail.com>
Date: Wed, 4 Apr 2012 16:23:39 +0400
Subject: [PATCH] ported CRC32/cpu_feature to gcc-intrinsic-enabled code

---
 core/CMakeLists.txt   |   3 +
 core/cpu_feature.m    | 207 ++++++------------------------------------
 core/crc32.c          |   6 +-
 include/cpu_feature.h |  12 +--
 include/crc32.h       |   4 +-
 5 files changed, 37 insertions(+), 195 deletions(-)

diff --git a/core/CMakeLists.txt b/core/CMakeLists.txt
index 8072287ed7..fb753cf491 100644
--- a/core/CMakeLists.txt
+++ b/core/CMakeLists.txt
@@ -48,6 +48,9 @@ execute_process(COMMAND ${CMAKE_COMMAND} -E touch_nocreate
 #
 set_property(DIRECTORY PROPERTY CLEAN_NO_CUSTOM 1)
 
+set_source_files_properties(cpu_feature.m
+    PROPERTIES COMPILE_FLAGS "-msse3 -msse4")
+
 #
 # Used by modules.
 #
diff --git a/core/cpu_feature.m b/core/cpu_feature.m
index 5ac4ad40ed..e9d2f22f47 100644
--- a/core/cpu_feature.m
+++ b/core/cpu_feature.m
@@ -27,202 +27,53 @@
 #include <errno.h>
 #include <stdlib.h>
 
-#include "cpu_feature.h"
-
-#if defined (__i386__) || defined (__x86_64__)
-
-enum { eAX=0, eBX, eCX, eDX };
-
-static const struct cpuid_feature {
-	unsigned int 	ri;
-	u_int32_t	bitmask;
-} cpu_mask[] = {
-	{eDX, (1 << 28)},	/* HT 		*/
-	{eCX, (1 << 19)},	/* SSE 4.1 	*/
-	{eCX, (1 << 20)},	/* SSE 4.2 	*/
-	{eCX, (1 << 31)}	/* HYPERV	*/
-};
-static const size_t LEN_cpu_mask = sizeof(cpu_mask) / sizeof (cpu_mask[0]);
-
-#define SCALE_F		sizeof(unsigned long)
-
-#if defined (__x86_64__)
-	#define REX_PRE "0x48, "
-#elif defined (__i386__)
-	#define REX_PRE
+#if !defined (__x86_64__) && !defined (__i386__)
+	#error "Only x86 and x86_64 architectures supported"
 #endif
 
+#ifndef __GNUC__
+	#error This module uses GCC intrinsic header(s) and should be compiled using gcc.
+#endif
 
-/* Hw-calculate CRC32 per byte (for the unaligned portion of data buffer). */
-/* NOTE: the function below was adopted from Linux 2.6 kernel source tree,
-   licensed under GPL. */
-static u_int32_t
-crc32c_hw_byte(u_int32_t crc, unsigned char const *data, size_t length)
-{
-	while (length--) {
-		__asm__ __volatile__(
-			".byte 0xf2, 0xf, 0x38, 0xf0, 0xf1"
-			:"=S"(crc)
-			:"0"(crc), "c"(*data)
-		);
-		data++;
-	}
+/* GCC intrinsic headers */
+#include <cpuid.h>
+#include <smmintrin.h>
 
-	return crc;
-}
+#include "cpu_feature.h"
 
 
-/* Hw-calculate CRC32 for the given data buffer.  */
-/* NOTE: the function below was adopted from Linux 2.6 kernel source tree,
-   licensed under GPL. */
-static u_int32_t
-crc32c_hw_intel(u_int32_t crc, unsigned char const *buf, size_t len)
+u_int32_t
+crc32c_hw(u_int32_t crc, const unsigned char *buf, unsigned int len)
 {
-	unsigned int iquotient = len / SCALE_F;
-	unsigned int iremainder = len % SCALE_F;
-	unsigned long *ptmp = (unsigned long *)buf;
-
-	while (iquotient--) {
-		__asm__ __volatile__(
-			".byte 0xf2, " REX_PRE "0xf, 0x38, 0xf1, 0xf1;"
-			:"=S"(crc)
-			:"0"(crc), "c"(*ptmp)
-		);
-		ptmp++;
-	}
-
-	if (iremainder) {
-		crc = crc32c_hw_byte(crc, (unsigned char *)ptmp,
-				 			iremainder);
-	}
-
-	return crc;
-}
+#define SCALE_F	sizeof(unsigned long)
+	size_t nwords = len / SCALE_F, nbytes = len % SCALE_F;
+	unsigned long *pword;
+	unsigned char *pbyte;
 
-
-/* Toggle x86 flag-register bits, as per mask. */
-static void
-toggle_x86_flags(long mask, long* orig, long* toggled)
-{
-	long forig = 0, fres = 0;
-
-#if defined (__i386__)
-	asm (
-		"pushfl; popl %%eax; movl %%eax, %0; xorl %2, %%eax; "
-		"pushl %%eax; popfl; pushfl; popl %%eax; pushl %0; popfl "
-		: "=r" (forig), "=a" (fres)
-		: "m" (mask)
-	);
-#elif __x86_64__
-	asm (
-		"pushfq; popq %%rax; movq %%rax, %0; xorq %2, %%rax; "
-		"pushq %%rax; popfq; pushfq; popq %%rax; pushq %0; popfq "
-		: "=r" (forig), "=a" (fres)
-		: "m" (mask)
-	);
+	for (pword = (unsigned long *)buf; nwords--; ++pword)
+#if defined (__x86_64__)
+		crc = (u_int32_t)_mm_crc32_u64((u_int64_t)crc, *pword);
+#elif defined (__i386__)
+		crc = _mm_crc32_u32(crc, *pword);
 #endif
 
-	if (orig)
-		*orig = forig;
-	if (toggled)
-		*toggled = fres;
-	return;
-}
-
+	if (nbytes)
+		for (pbyte = (unsigned char*)pword; nbytes--; ++pbyte)
+			crc = _mm_crc32_u8(crc, *pbyte);
 
-/* Is CPUID instruction available ? */
-static int
-can_cpuid()
-{
-	long of = -1, tf = -1;
-
-	/* x86 flag register masks */
-	enum {
-		cpuf_AC = (1 << 18), 	/* bit 18 */
-		cpuf_ID = (1 << 21)	/* bit 21 */
-	};
-
-
-	/* Check if AC (alignment) flag could be toggled:
-		if not - it's i386, thus no CPUID.
-	*/
-	toggle_x86_flags(cpuf_AC, &of, &tf);
-	if ((of & cpuf_AC) == (tf & cpuf_AC)) {
-		return 0;
-	}
-
-	/* Next try toggling CPUID (ID) flag. */
-	toggle_x86_flags(cpuf_ID, &of, &tf);
-	if ((of & cpuf_ID) == (tf & cpuf_ID)) {
-		return 0;
-	}
-
-	return 1;
-}
-
-
-/* Retrieve CPUID data using info as the EAX key. */
-static void
-get_cpuid(long info, long* eax, long* ebx, long* ecx, long *edx)
-{
-	*eax = info;
-
-#if defined (__i386__)
-	asm __volatile__ (
-		"movl %%ebx, %%edi; " 	/* must save ebx for 32-bit PIC code */
-		"cpuid; "
-		"movl %%ebx, %%esi; "
-		"movl %%edi, %%ebx; "
-		: "+a" (*eax), "=S" (*ebx), "=c" (*ecx), "=d" (*edx)
-		:
-		: "%edi"
-	);
-#elif defined (__x86_64__)
-	asm __volatile__ (
-		"cpuid; "
-		: "+a" (*eax), "=b" (*ebx), "=c" (*ecx), "=d" (*edx)
-	);
-#endif
+	return crc;
 }
 
 
-/* Check whether CPU has a certain feature. */
 bool
-cpu_has(unsigned int feature)
-{
-	long info = 1, reg[4] = {0,0,0,0};
-
-	if (!can_cpuid() || feature > LEN_cpu_mask)
-		return false;
-
-	get_cpuid(info, &reg[eAX], &reg[eBX], &reg[eCX], &reg[eDX]);
-
-	return (reg[cpu_mask[feature].ri] & cpu_mask[feature].bitmask) ? 1 : 0;
-}
-
-
-u_int32_t
-crc32c_hw(u_int32_t crc, const unsigned char *buf, unsigned int len)
+sse42_enabled_cpu()
 {
-	return crc32c_hw_intel (crc, (unsigned char const*)buf, len);
-}
-
-#else /* other (yet unsupported architectures) */
+	unsigned int ax, bx, cx, dx;
 
-bool
-cpu_has(unsigned int feature)
-{
-	(void)feature;
-	return false;
-}
+	if (__get_cpuid(1 /* level */, &ax, &bx, &cx, &dx) == 0)
+		return 0; /* not supported */
 
-u_int32_t
-crc32c_hw(u_int32_t crc, const unsigned char *buf, unsigned int len)
-{
-	(void)crc; (void)buf, (void)len;
-	abort();
-	return 0;
+	return (cx & (1 << 20)) != 0;
 }
 
-#endif /* defined (__i386__) || defined (__x86_64__) */
 
diff --git a/core/crc32.c b/core/crc32.c
index 097f651166..1d77af4936 100644
--- a/core/crc32.c
+++ b/core/crc32.c
@@ -38,10 +38,6 @@ crc32_func crc32_calc = NULL;
 void
 crc32_init()
 {
-#if defined (__i386__) || defined (__x86_64__)
-	crc32_calc = cpu_has(cpuf_sse4_2) ? &crc32c_hw : &crc32c;
-#else
-	crc32_calc = &crc32c;
-#endif
+	crc32_calc = sse42_enabled_cpu() ? &crc32c_hw : &crc32c;
 }
 
diff --git a/include/cpu_feature.h b/include/cpu_feature.h
index 5fdcaa899e..fad19e75e8 100644
--- a/include/cpu_feature.h
+++ b/include/cpu_feature.h
@@ -28,21 +28,13 @@
 #include <sys/types.h>
 #include <stdbool.h>
 
-/* CPU feature capabilities to use with cpu_has (feature). */
-
-#if defined (__i386__) || defined (__x86_64__)
-enum {
-	cpuf_ht = 0, cpuf_sse4_1, cpuf_sse4_2, cpuf_hypervisor
-};
-#endif
-
-/* Check whether CPU has a certain feature.
+/* Check whether CPU supports SSE 4.2 (needed to compute CRC32 in hardware).
  *
- * @param	feature		indetifier (see above) of the target feature
+ * Takes no arguments: the feature tested is always SSE 4.2.
  *
  * @return	true if feature is available, false if unavailable.
  */
-bool cpu_has(unsigned int feature);
+bool sse42_enabled_cpu(void);
 
 
 /* Hardware-calculate CRC32 for the given data buffer.
diff --git a/include/crc32.h b/include/crc32.h
index f1dba110f2..1e5d0aa65e 100644
--- a/include/crc32.h
+++ b/include/crc32.h
@@ -28,10 +28,10 @@
  * THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
+#include <sys/types.h>
 #include <util.h>
 
-typedef u32 (*crc32_func)(u32 crc, const unsigned char *buf,
-			  unsigned int len);
+typedef u_int32_t (*crc32_func)(u_int32_t crc, const unsigned char *buf, unsigned int len);
 
 /*
  * Pointer to an architecture-specific implementation of
-- 
GitLab