[PATCH] osmo-trx[master]: ssedetect: Add runtime CPU detection

historical

Review at  https://gerrit.osmocom.org/2100

ssedetect: Add runtime CPU detection

The current implementation can select the SSE support level at
compile time only.

This commit adds functionality to detect the SSE support level of the
CPU at runtime and to automatically fall back to the generic
implementation if the CPU does not support the required SSE level.

Change-Id: Iba74f8a6e4e921ff31e4bd9f0c7c881fe547423a
---
M Transceiver52M/arm/convolve.c
M Transceiver52M/common/convert.h
M Transceiver52M/common/convolve.h
M Transceiver52M/osmo-trx.cpp
M Transceiver52M/x86/convert.c
M Transceiver52M/x86/convolve.c
6 files changed, 142 insertions(+), 49 deletions(-)


  git pull ssh://gerrit.osmocom.org:29418/osmo-trx refs/changes/00/2100/1

diff --git a/Transceiver52M/arm/convolve.c b/Transceiver52M/arm/convolve.c
index 2b42090..912d0c2 100644
--- a/Transceiver52M/arm/convolve.c
+++ b/Transceiver52M/arm/convolve.c
@@ -58,6 +58,13 @@
 }
 #endif
 
+/* API: Initialize convolve module */
+void convolve_init(void)
+{
+	/* Stub */
+	return;
+}
+
 /* API: Aligned complex-real */
 int convolve_real(float *x, int x_len,
 		  float *h, int h_len,
diff --git a/Transceiver52M/common/convert.h b/Transceiver52M/common/convert.h
index 4827c28..1d3a180 100644
--- a/Transceiver52M/common/convert.h
+++ b/Transceiver52M/common/convert.h
@@ -3,5 +3,6 @@
 
 void convert_float_short(short *out, const float *in, float scale, int len);
 void convert_short_float(float *out, const short *in, int len);
+void convert_init(void);
 
 #endif /* _CONVERT_H_ */
diff --git a/Transceiver52M/common/convolve.h b/Transceiver52M/common/convolve.h
index 08bda0c..43db577 100644
--- a/Transceiver52M/common/convolve.h
+++ b/Transceiver52M/common/convolve.h
@@ -27,4 +27,6 @@
 			  int start, int len,
 			  int step, int offset);
 
+void convolve_init(void);
+
 #endif /* _CONVOLVE_H_ */
diff --git a/Transceiver52M/osmo-trx.cpp b/Transceiver52M/osmo-trx.cpp
index 5e81586..dff482e 100644
--- a/Transceiver52M/osmo-trx.cpp
+++ b/Transceiver52M/osmo-trx.cpp
@@ -32,6 +32,11 @@
 #include <Logger.h>
 #include <Configuration.h>
 
+extern "C" {
+#include "convolve.h"
+#include "convert.h"
+}
+
 /* Samples-per-symbol for downlink path
  *     4 - Uses precision modulator (more computation, less distortion)
  *     1 - Uses minimized modulator (less computation, more distortion)
@@ -498,6 +503,9 @@
 	RadioDevice::InterfaceType iface = RadioDevice::NORMAL;
 	struct trx_config config;
 
+	convolve_init();
+	convert_init();
+
 	handle_options(argc, argv, &config);
 
 	setup_signal_handlers();
diff --git a/Transceiver52M/x86/convert.c b/Transceiver52M/x86/convert.c
index 862a2e7..db1c0fc 100644
--- a/Transceiver52M/x86/convert.c
+++ b/Transceiver52M/x86/convert.c
@@ -25,6 +25,17 @@
 #include "config.h"
 #endif
 
+/* Architecture dependent function pointers */
+struct convert_cpu_context {
+	void (*convert_si16_ps_16n) (float *, const short *, int);
+	void (*convert_si16_ps) (float *, const short *, int);
+	void (*convert_scale_ps_si16_16n)(short *, const float *, float, int);
+	void (*convert_scale_ps_si16_8n)(short *, const float *, float, int);
+	void (*convert_scale_ps_si16)(short *, const float *, float, int);
+};
+
+static struct convert_cpu_context c;
+
 #ifdef HAVE_SSE3
 #include <xmmintrin.h>
 #include <emmintrin.h>
@@ -157,53 +168,61 @@
 		_mm_storeu_si128((__m128i *) &out[16 * i + 8], m7);
 	}
 }
-#else /* HAVE_SSE3 */
+#endif
+
+__attribute__((optimize("no-tree-vectorize")))
 static void convert_scale_ps_si16(short *out, const float *in,
 				  float scale, int len)
 {
 	for (int i = 0; i < len; i++)
 		out[i] = in[i] * scale;
 }
-#endif
 
-#ifndef HAVE_SSE4_1
+__attribute__((optimize("no-tree-vectorize")))
 static void convert_si16_ps(float *out, const short *in, int len)
 {
 	for (int i = 0; i < len; i++)
 		out[i] = in[i];
 }
+
+void convert_init(void)
+{
+	c.convert_scale_ps_si16_16n = convert_scale_ps_si16;
+	c.convert_scale_ps_si16_8n = convert_scale_ps_si16;
+	c.convert_scale_ps_si16 = convert_scale_ps_si16;
+	c.convert_si16_ps_16n = convert_si16_ps;
+	c.convert_si16_ps = convert_si16_ps;
+
+#ifdef HAVE_SSE4_1
+	if (__builtin_cpu_supports("sse4.1")) {
+		c.convert_si16_ps_16n = &_sse_convert_si16_ps_16n;
+		c.convert_si16_ps = &_sse_convert_si16_ps;
+	}
 #endif
+
+#ifdef HAVE_SSE3
+	if (__builtin_cpu_supports("sse3")) {
+		c.convert_scale_ps_si16_16n = _sse_convert_scale_ps_si16_16n;
+		c.convert_scale_ps_si16_8n = _sse_convert_scale_ps_si16_8n;
+		c.convert_scale_ps_si16 = _sse_convert_scale_ps_si16;
+	}
+#endif
+}
 
 void convert_float_short(short *out, const float *in, float scale, int len)
 {
-	void (*conv_func)(short *, const float *, float, int);
-
-#ifdef HAVE_SSE3
 	if (!(len % 16))
-		conv_func = _sse_convert_scale_ps_si16_16n;
+		c.convert_scale_ps_si16_16n(out, in, scale, len);
 	else if (!(len % 8))
-		conv_func = _sse_convert_scale_ps_si16_8n;
+		c.convert_scale_ps_si16_8n(out, in, scale, len);
 	else
-		conv_func = _sse_convert_scale_ps_si16;
-#else
-	conv_func = convert_scale_ps_si16;
-#endif
-
-	conv_func(out, in, scale, len);
+		c.convert_scale_ps_si16(out, in, scale, len);
 }
 
 void convert_short_float(float *out, const short *in, int len)
 {
-	void (*conv_func) (float *, const short *, int);
-
-#ifdef HAVE_SSE4_1
 	if (!(len % 16))
-		conv_func = _sse_convert_si16_ps_16n;
+		c.convert_si16_ps_16n(out, in, len);
 	else
-		conv_func = _sse_convert_si16_ps;
-#else
-	conv_func = convert_si16_ps;
-#endif
-
-	conv_func(out, in, len);
+		c.convert_si16_ps(out, in, len);
 }
diff --git a/Transceiver52M/x86/convolve.c b/Transceiver52M/x86/convolve.c
index e2a1dea..2f3b293 100644
--- a/Transceiver52M/x86/convolve.c
+++ b/Transceiver52M/x86/convolve.c
@@ -26,6 +26,31 @@
 #include "config.h"
 #endif
 
+/* Architecture dependent function pointers */
+struct convolve_cpu_context {
+	void (*conv_cmplx_4n) (const float *, int, const float *, int, float *,
+			       int, int, int, int, int);
+	void (*conv_cmplx_8n) (const float *, int, const float *, int, float *,
+			       int, int, int, int, int);
+	void (*conv_cmplx) (const float *, int, const float *, int, float *,
+			    int, int, int, int, int);
+	void (*conv_real4) (const float *, int, const float *, int, float *,
+			    int, int, int, int, int);
+	void (*conv_real8) (const float *, int, const float *, int, float *,
+			    int, int, int, int, int);
+	void (*conv_real12) (const float *, int, const float *, int, float *,
+			     int, int, int, int, int);
+	void (*conv_real16) (const float *, int, const float *, int, float *,
+			     int, int, int, int, int);
+	void (*conv_real20) (const float *, int, const float *, int, float *,
+			     int, int, int, int, int);
+	void (*conv_real4n) (const float *, int, const float *, int, float *,
+			     int, int, int, int, int);
+	void (*conv_real) (const float *, int, const float *, int, float *, int,
+			   int, int, int, int);
+};
+static struct convolve_cpu_context c;
+
 /* Forward declarations from base implementation */
 int _base_convolve_real(const float *x, int x_len,
 			const float *h, int h_len,
@@ -565,45 +590,77 @@
 }
 #endif
 
+/* API: Initialize convolve module */
+void convolve_init(void)
+{
+	c.conv_cmplx_4n = (void *)_base_convolve_complex;
+	c.conv_cmplx_8n = (void *)_base_convolve_complex;
+	c.conv_cmplx = (void *)_base_convolve_complex;
+	c.conv_real4 = (void *)_base_convolve_real;
+	c.conv_real8 = (void *)_base_convolve_real;
+	c.conv_real12 = (void *)_base_convolve_real;
+	c.conv_real16 = (void *)_base_convolve_real;
+	c.conv_real20 = (void *)_base_convolve_real;
+	c.conv_real4n = (void *)_base_convolve_real;
+	c.conv_real = (void *)_base_convolve_real;
+
+#ifdef HAVE_SSE3
+	if (__builtin_cpu_supports("sse3")) {
+		c.conv_cmplx_4n = sse_conv_cmplx_4n;
+		c.conv_cmplx_8n = sse_conv_cmplx_8n;
+		c.conv_real4 = sse_conv_real4;
+		c.conv_real8 = sse_conv_real8;
+		c.conv_real12 = sse_conv_real12;
+		c.conv_real16 = sse_conv_real16;
+		c.conv_real20 = sse_conv_real20;
+		c.conv_real4n = sse_conv_real4n;
+	}
+#endif
+}
+
 /* API: Aligned complex-real */
 int convolve_real(const float *x, int x_len,
 		  const float *h, int h_len,
 		  float *y, int y_len, int start, int len, int step, int offset)
 {
-	void (*conv_func) (const float *, int, const float *, int, float *, int,
-			   int, int, int, int) = (void *)_base_convolve_real;
-
 	if (bounds_check(x_len, h_len, y_len, start, len, step) < 0)
 		return -1;
 
 	memset(y, 0, len * 2 * sizeof(float));
 
-#ifdef HAVE_SSE3
 	if (step <= 4) {
 		switch (h_len) {
 		case 4:
-			conv_func = sse_conv_real4;
+			c.conv_real4(x, x_len, h, h_len, y, y_len, start, len,
+				     step, offset);
 			break;
 		case 8:
-			conv_func = sse_conv_real8;
+			c.conv_real8(x, x_len, h, h_len, y, y_len, start, len,
+				     step, offset);
 			break;
 		case 12:
-			conv_func = sse_conv_real12;
+			c.conv_real12(x, x_len, h, h_len, y, y_len, start, len,
+				      step, offset);
 			break;
 		case 16:
-			conv_func = sse_conv_real16;
+			c.conv_real16(x, x_len, h, h_len, y, y_len, start, len,
+				      step, offset);
 			break;
 		case 20:
-			conv_func = sse_conv_real20;
+			c.conv_real20(x, x_len, h, h_len, y, y_len, start, len,
+				      step, offset);
 			break;
 		default:
 			if (!(h_len % 4))
-				conv_func = sse_conv_real4n;
+				c.conv_real4n(x, x_len, h, h_len, y, y_len,
+					      start, len, step, offset);
+			else
+				c.conv_real(x, x_len, h, h_len, y, y_len, start,
+					    len, step, offset);
 		}
-	}
-#endif
-
-	conv_func(x, x_len, h, h_len, y, y_len, start, len, step, offset);
+	} else
+		c.conv_real(x, x_len, h, h_len, y, y_len, start, len, step,
+			    offset);
 
 	return len;
 }
@@ -614,25 +671,24 @@
 		     float *y, int y_len,
 		     int start, int len, int step, int offset)
 {
-	void (*conv_func) (const float *, int, const float *, int, float *, int,
-			   int, int, int, int) =
-	    (void *)_base_convolve_complex;
-
 	if (bounds_check(x_len, h_len, y_len, start, len, step) < 0)
 		return -1;
 
 	memset(y, 0, len * 2 * sizeof(float));
 
-#ifdef HAVE_SSE3
 	if (step <= 4) {
 		if (!(h_len % 8))
-			conv_func = sse_conv_cmplx_8n;
+			c.conv_cmplx_8n(x, x_len, h, h_len, y, y_len, start,
+					len, step, offset);
 		else if (!(h_len % 4))
-			conv_func = sse_conv_cmplx_4n;
-	}
-#endif
-
-	conv_func(x, x_len, h, h_len, y, y_len, start, len, step, offset);
+			c.conv_cmplx_4n(x, x_len, h, h_len, y, y_len, start,
+					len, step, offset);
+		else
+			c.conv_cmplx(x, x_len, h, h_len, y, y_len, start, len,
+				     step, offset);
+	} else
+		c.conv_cmplx(x, x_len, h, h_len, y, y_len, start, len, step,
+			     offset);
 
 	return len;
 }

-- 
To view, visit https://gerrit.osmocom.org/2100
To unsubscribe, visit https://gerrit.osmocom.org/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: Iba74f8a6e4e921ff31e4bd9f0c7c881fe547423a
Gerrit-PatchSet: 1
Gerrit-Project: osmo-trx
Gerrit-Branch: master
Gerrit-Owner: dexter <pmaier at sysmocom.de>