This is a historical archive covering the years 2008-2021, from before the migration to Mailman 3.
The current, actively maintained list archive can be found at https://lists.osmocom.org/hyperkitty/list/gerrit-log@lists.osmocom.org/.
Tom Tsou gerrit-no-reply at lists.osmocom.orgTom Tsou has submitted this change and it was merged. Change subject: cosmetic: Make parameter lists uniform ...................................................................... cosmetic: Make parameter lists uniform The non-sse implementation and the sse implementation of the convert and convolve functions have different parameter lists. This makes it difficult to use function pointers in order to select the right function depending on the SSE-Level and CPU. This commit uniformizes the parameter lists in preparation for planned runtime cpu detection support Change-Id: Ice063b89791537c4b591751f12f5ef5c413a2d27 --- M Transceiver52M/x86/convert.c M Transceiver52M/x86/convolve.c 2 files changed, 143 insertions(+), 110 deletions(-) Approvals: Tom Tsou: Looks good to me, approved Harald Welte: Looks good to me, but someone else must approve Jenkins Builder: Verified diff --git a/Transceiver52M/x86/convert.c b/Transceiver52M/x86/convert.c index eafe7b2..862a2e7 100644 --- a/Transceiver52M/x86/convert.c +++ b/Transceiver52M/x86/convert.c @@ -176,26 +176,34 @@ void convert_float_short(short *out, const float *in, float scale, int len) { + void (*conv_func)(short *, const float *, float, int); + #ifdef HAVE_SSE3 if (!(len % 16)) - _sse_convert_scale_ps_si16_16n(out, in, scale, len); + conv_func = _sse_convert_scale_ps_si16_16n; else if (!(len % 8)) - _sse_convert_scale_ps_si16_8n(out, in, scale, len); + conv_func = _sse_convert_scale_ps_si16_8n; else - _sse_convert_scale_ps_si16(out, in, scale, len); + conv_func = _sse_convert_scale_ps_si16; #else - convert_scale_ps_si16(out, in, scale, len); + conv_func = convert_scale_ps_si16; #endif + + conv_func(out, in, scale, len); } void convert_short_float(float *out, const short *in, int len) { + void (*conv_func) (float *, const short *, int); + #ifdef HAVE_SSE4_1 if (!(len % 16)) - _sse_convert_si16_ps_16n(out, in, len); + conv_func = _sse_convert_si16_ps_16n; else - 
_sse_convert_si16_ps(out, in, len); + conv_func = _sse_convert_si16_ps; #else - convert_si16_ps(out, in, len); + conv_func = convert_si16_ps; #endif + + conv_func(out, in, len); } diff --git a/Transceiver52M/x86/convolve.c b/Transceiver52M/x86/convolve.c index 04923bc..e2a1dea 100644 --- a/Transceiver52M/x86/convolve.c +++ b/Transceiver52M/x86/convolve.c @@ -47,12 +47,20 @@ #include <pmmintrin.h> /* 4-tap SSE complex-real convolution */ -static void sse_conv_real4(const float *restrict x, - const float *restrict h, - float *restrict y, - int len) +static void sse_conv_real4(const float *x, int x_len, + const float *h, int h_len, + float *y, int y_len, + int start, int len, + int step, int offset) { + /* NOTE: The parameter list of this function has to match the parameter + * list of _base_convolve_real() in convolve_base.c. This specific + * implementation, ignores some of the parameters of + * _base_convolve_complex(), which are: x_len, y_len, offset, step */ + __m128 m0, m1, m2, m3, m4, m5, m6, m7; + + const float *_x = &x[2 * (-(h_len - 1) + start)]; /* Load (aligned) filter taps */ m0 = _mm_load_ps(&h[0]); @@ -61,8 +69,8 @@ for (int i = 0; i < len; i++) { /* Load (unaligned) input data */ - m0 = _mm_loadu_ps(&x[2 * i + 0]); - m1 = _mm_loadu_ps(&x[2 * i + 4]); + m0 = _mm_loadu_ps(&_x[2 * i + 0]); + m1 = _mm_loadu_ps(&_x[2 * i + 4]); m2 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2)); m3 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3)); @@ -81,12 +89,17 @@ } /* 8-tap SSE complex-real convolution */ -static void sse_conv_real8(const float *restrict x, - const float *restrict h, - float *restrict y, - int len) +static void sse_conv_real8(const float *x, int x_len, + const float *h, int h_len, + float *y, int y_len, + int start, int len, + int step, int offset) { + /* See NOTE in sse_conv_real4() */ + __m128 m0, m1, m2, m3, m4, m5, m6, m7, m8, m9; + + const float *_x = &x[2 * (-(h_len - 1) + start)]; /* Load (aligned) filter taps */ m0 = _mm_load_ps(&h[0]); @@ 
-99,10 +112,10 @@ for (int i = 0; i < len; i++) { /* Load (unaligned) input data */ - m0 = _mm_loadu_ps(&x[2 * i + 0]); - m1 = _mm_loadu_ps(&x[2 * i + 4]); - m2 = _mm_loadu_ps(&x[2 * i + 8]); - m3 = _mm_loadu_ps(&x[2 * i + 12]); + m0 = _mm_loadu_ps(&_x[2 * i + 0]); + m1 = _mm_loadu_ps(&_x[2 * i + 4]); + m2 = _mm_loadu_ps(&_x[2 * i + 8]); + m3 = _mm_loadu_ps(&_x[2 * i + 12]); m6 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2)); m7 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3)); @@ -128,13 +141,18 @@ } /* 12-tap SSE complex-real convolution */ -static void sse_conv_real12(const float *restrict x, - const float *restrict h, - float *restrict y, - int len) +static void sse_conv_real12(const float *x, int x_len, + const float *h, int h_len, + float *y, int y_len, + int start, int len, + int step, int offset) { + /* See NOTE in sse_conv_real4() */ + __m128 m0, m1, m2, m3, m4, m5, m6, m7; __m128 m8, m9, m10, m11, m12, m13, m14; + + const float *_x = &x[2 * (-(h_len - 1) + start)]; /* Load (aligned) filter taps */ m0 = _mm_load_ps(&h[0]); @@ -150,18 +168,18 @@ for (int i = 0; i < len; i++) { /* Load (unaligned) input data */ - m0 = _mm_loadu_ps(&x[2 * i + 0]); - m1 = _mm_loadu_ps(&x[2 * i + 4]); - m2 = _mm_loadu_ps(&x[2 * i + 8]); - m3 = _mm_loadu_ps(&x[2 * i + 12]); + m0 = _mm_loadu_ps(&_x[2 * i + 0]); + m1 = _mm_loadu_ps(&_x[2 * i + 4]); + m2 = _mm_loadu_ps(&_x[2 * i + 8]); + m3 = _mm_loadu_ps(&_x[2 * i + 12]); m4 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2)); m5 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3)); m6 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(0, 2, 0, 2)); m7 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(1, 3, 1, 3)); - m0 = _mm_loadu_ps(&x[2 * i + 16]); - m1 = _mm_loadu_ps(&x[2 * i + 20]); + m0 = _mm_loadu_ps(&_x[2 * i + 16]); + m1 = _mm_loadu_ps(&_x[2 * i + 20]); m8 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2)); m9 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3)); @@ -190,13 +208,18 @@ } /* 16-tap SSE complex-real convolution */ -static void 
sse_conv_real16(const float *restrict x, - const float *restrict h, - float *restrict y, - int len) +static void sse_conv_real16(const float *x, int x_len, + const float *h, int h_len, + float *y, int y_len, + int start, int len, + int step, int offset) { + /* See NOTE in sse_conv_real4() */ + __m128 m0, m1, m2, m3, m4, m5, m6, m7; __m128 m8, m9, m10, m11, m12, m13, m14, m15; + + const float *_x = &x[2 * (-(h_len - 1) + start)]; /* Load (aligned) filter taps */ m0 = _mm_load_ps(&h[0]); @@ -216,20 +239,20 @@ for (int i = 0; i < len; i++) { /* Load (unaligned) input data */ - m0 = _mm_loadu_ps(&x[2 * i + 0]); - m1 = _mm_loadu_ps(&x[2 * i + 4]); - m2 = _mm_loadu_ps(&x[2 * i + 8]); - m3 = _mm_loadu_ps(&x[2 * i + 12]); + m0 = _mm_loadu_ps(&_x[2 * i + 0]); + m1 = _mm_loadu_ps(&_x[2 * i + 4]); + m2 = _mm_loadu_ps(&_x[2 * i + 8]); + m3 = _mm_loadu_ps(&_x[2 * i + 12]); m4 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2)); m5 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3)); m6 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(0, 2, 0, 2)); m7 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(1, 3, 1, 3)); - m0 = _mm_loadu_ps(&x[2 * i + 16]); - m1 = _mm_loadu_ps(&x[2 * i + 20]); - m2 = _mm_loadu_ps(&x[2 * i + 24]); - m3 = _mm_loadu_ps(&x[2 * i + 28]); + m0 = _mm_loadu_ps(&_x[2 * i + 16]); + m1 = _mm_loadu_ps(&_x[2 * i + 20]); + m2 = _mm_loadu_ps(&_x[2 * i + 24]); + m3 = _mm_loadu_ps(&_x[2 * i + 28]); m8 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2)); m9 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3)); @@ -265,13 +288,18 @@ } /* 20-tap SSE complex-real convolution */ -static void sse_conv_real20(const float *restrict x, - const float *restrict h, - float *restrict y, - int len) +static void sse_conv_real20(const float *x, int x_len, + const float *h, int h_len, + float *y, int y_len, + int start, int len, + int step, int offset) { + /* See NOTE in sse_conv_real4() */ + __m128 m0, m1, m2, m3, m4, m5, m6, m7; __m128 m8, m9, m11, m12, m13, m14, m15; + + const float *_x = &x[2 * (-(h_len - 1) + 
start)]; /* Load (aligned) filter taps */ m0 = _mm_load_ps(&h[0]); @@ -293,12 +321,12 @@ for (int i = 0; i < len; i++) { /* Multiply-accumulate first 12 taps */ - m0 = _mm_loadu_ps(&x[2 * i + 0]); - m1 = _mm_loadu_ps(&x[2 * i + 4]); - m2 = _mm_loadu_ps(&x[2 * i + 8]); - m3 = _mm_loadu_ps(&x[2 * i + 12]); - m4 = _mm_loadu_ps(&x[2 * i + 16]); - m5 = _mm_loadu_ps(&x[2 * i + 20]); + m0 = _mm_loadu_ps(&_x[2 * i + 0]); + m1 = _mm_loadu_ps(&_x[2 * i + 4]); + m2 = _mm_loadu_ps(&_x[2 * i + 8]); + m3 = _mm_loadu_ps(&_x[2 * i + 12]); + m4 = _mm_loadu_ps(&_x[2 * i + 16]); + m5 = _mm_loadu_ps(&_x[2 * i + 20]); m6 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2)); m7 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3)); @@ -320,10 +348,10 @@ m9 = _mm_add_ps(m1, m7); /* Multiply-accumulate last 8 taps */ - m0 = _mm_loadu_ps(&x[2 * i + 24]); - m1 = _mm_loadu_ps(&x[2 * i + 28]); - m2 = _mm_loadu_ps(&x[2 * i + 32]); - m3 = _mm_loadu_ps(&x[2 * i + 36]); + m0 = _mm_loadu_ps(&_x[2 * i + 24]); + m1 = _mm_loadu_ps(&_x[2 * i + 28]); + m2 = _mm_loadu_ps(&_x[2 * i + 32]); + m3 = _mm_loadu_ps(&_x[2 * i + 36]); m4 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2)); m5 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3)); @@ -351,12 +379,17 @@ } /* 4*N-tap SSE complex-real convolution */ -static void sse_conv_real4n(const float *x, - const float *h, - float *y, - int h_len, int len) +static void sse_conv_real4n(const float *x, int x_len, + const float *h, int h_len, + float *y, int y_len, + int start, int len, + int step, int offset) { + /* See NOTE in sse_conv_real4() */ + __m128 m0, m1, m2, m4, m5, m6, m7; + + const float *_x = &x[2 * (-(h_len - 1) + start)]; for (int i = 0; i < len; i++) { /* Zero */ @@ -370,8 +403,8 @@ m2 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2)); /* Load (unaligned) input data */ - m0 = _mm_loadu_ps(&x[2 * i + 8 * n + 0]); - m1 = _mm_loadu_ps(&x[2 * i + 8 * n + 4]); + m0 = _mm_loadu_ps(&_x[2 * i + 8 * n + 0]); + m1 = _mm_loadu_ps(&_x[2 * i + 8 * n + 4]); m4 = 
_mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2)); m5 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3)); @@ -394,12 +427,20 @@ } /* 4*N-tap SSE complex-complex convolution */ -static void sse_conv_cmplx_4n(const float *x, - const float *h, - float *y, - int h_len, int len) +static void sse_conv_cmplx_4n(const float *x, int x_len, + const float *h, int h_len, + float *y, int y_len, + int start, int len, + int step, int offset) { + /* NOTE: The parameter list of this function has to match the parameter + * list of _base_convolve_complex() in convolve_base.c. This specific + * implementation, ignores some of the parameters of + * _base_convolve_complex(), which are: x_len, y_len, offset, step. */ + __m128 m0, m1, m2, m3, m4, m5, m6, m7; + + const float *_x = &x[2 * (-(h_len - 1) + start)]; for (int i = 0; i < len; i++) { /* Zero */ @@ -414,8 +455,8 @@ m3 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3)); /* Load (unaligned) input data */ - m0 = _mm_loadu_ps(&x[2 * i + 8 * n + 0]); - m1 = _mm_loadu_ps(&x[2 * i + 8 * n + 4]); + m0 = _mm_loadu_ps(&_x[2 * i + 8 * n + 0]); + m1 = _mm_loadu_ps(&_x[2 * i + 8 * n + 4]); m4 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2)); m5 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3)); @@ -445,13 +486,18 @@ } /* 8*N-tap SSE complex-complex convolution */ -static void sse_conv_cmplx_8n(const float *x, - const float *h, - float *y, - int h_len, int len) +static void sse_conv_cmplx_8n(const float *x, int x_len, + const float *h, int h_len, + float *y, int y_len, + int start, int len, + int step, int offset) { + /* See NOTE in sse_conv_cmplx_4n() */ + __m128 m0, m1, m2, m3, m4, m5, m6, m7; __m128 m8, m9, m10, m11, m12, m13, m14, m15; + + const float *_x = &x[2 * (-(h_len - 1) + start)]; for (int i = 0; i < len; i++) { /* Zero */ @@ -473,10 +519,10 @@ m7 = _mm_shuffle_ps(m2, m3, _MM_SHUFFLE(1, 3, 1, 3)); /* Load (unaligned) input data */ - m0 = _mm_loadu_ps(&x[2 * i + 16 * n + 0]); - m1 = _mm_loadu_ps(&x[2 * i + 16 * n + 4]); - m2 = 
_mm_loadu_ps(&x[2 * i + 16 * n + 8]); - m3 = _mm_loadu_ps(&x[2 * i + 16 * n + 12]); + m0 = _mm_loadu_ps(&_x[2 * i + 16 * n + 0]); + m1 = _mm_loadu_ps(&_x[2 * i + 16 * n + 4]); + m2 = _mm_loadu_ps(&_x[2 * i + 16 * n + 8]); + m3 = _mm_loadu_ps(&_x[2 * i + 16 * n + 12]); m8 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(0, 2, 0, 2)); m9 = _mm_shuffle_ps(m0, m1, _MM_SHUFFLE(1, 3, 1, 3)); @@ -522,14 +568,10 @@ /* API: Aligned complex-real */ int convolve_real(const float *x, int x_len, const float *h, int h_len, - float *y, int y_len, - int start, int len, - int step, int offset) + float *y, int y_len, int start, int len, int step, int offset) { - void (*conv_func)(const float *, const float *, - float *, int) = NULL; - void (*conv_func_n)(const float *, const float *, - float *, int, int) = NULL; + void (*conv_func) (const float *, int, const float *, int, float *, int, + int, int, int, int) = (void *)_base_convolve_real; if (bounds_check(x_len, h_len, y_len, start, len, step) < 0) return -1; @@ -556,22 +598,12 @@ break; default: if (!(h_len % 4)) - conv_func_n = sse_conv_real4n; + conv_func = sse_conv_real4n; } } #endif - if (conv_func) { - conv_func(&x[2 * (-(h_len - 1) + start)], - h, y, len); - } else if (conv_func_n) { - conv_func_n(&x[2 * (-(h_len - 1) + start)], - h, y, h_len, len); - } else { - _base_convolve_real(x, x_len, - h, h_len, - y, y_len, - start, len, step, offset); - } + + conv_func(x, x_len, h, h_len, y, y_len, start, len, step, offset); return len; } @@ -580,11 +612,11 @@ int convolve_complex(const float *x, int x_len, const float *h, int h_len, float *y, int y_len, - int start, int len, - int step, int offset) + int start, int len, int step, int offset) { - void (*conv_func)(const float *, const float *, - float *, int, int) = NULL; + void (*conv_func) (const float *, int, const float *, int, float *, int, + int, int, int, int) = + (void *)_base_convolve_complex; if (bounds_check(x_len, h_len, y_len, start, len, step) < 0) return -1; @@ -599,15 +631,8 @@ 
conv_func = sse_conv_cmplx_4n; } #endif - if (conv_func) { - conv_func(&x[2 * (-(h_len - 1) + start)], - h, y, h_len, len); - } else { - _base_convolve_complex(x, x_len, - h, h_len, - y, y_len, - start, len, step, offset); - } + + conv_func(x, x_len, h, h_len, y, y_len, start, len, step, offset); return len; } -- To view, visit https://gerrit.osmocom.org/2099 To unsubscribe, visit https://gerrit.osmocom.org/settings Gerrit-MessageType: merged Gerrit-Change-Id: Ice063b89791537c4b591751f12f5ef5c413a2d27 Gerrit-PatchSet: 4 Gerrit-Project: osmo-trx Gerrit-Branch: master Gerrit-Owner: dexter <pmaier at sysmocom.de> Gerrit-Reviewer: Harald Welte <laforge at gnumonks.org> Gerrit-Reviewer: Jenkins Builder Gerrit-Reviewer: Tom Tsou <tom at tsou.cc>