gr-fosphor: error: work group size exceeds the maximum default value for the selected device (AMD HD 5450)

historical

Hi,

> I just compiled gr-fosphor on a PC equipped with a HD 5450 card, and when I try to run osmocom_fft -F I gut the following error:

> [+] Selected device: Cedar
> Build log for 'display.cl':
> "/tmp/OCL2287T8.cl", line 67: error: work group size exceeds the maximum
>           default value for the selected device
>   __attribute__((reqd_work_group_size(16, 16, 1)))
>                  ^
>
> 1 error detected in the compilation of "/tmp/OCL2287T8.cl".
> Frontend phase failed compilation.

Yes, that card only suppot 128 as a maximum work group size and
fosphor is written to use 256. I think this particular generation of
card is the only one that's new enough to support OpenCL but old
enough to not support WG of 256.

I did however write a patch to support it a while back and just never
bothered to clean it up (to select size dynamically depending on your
card).

Try the attached patch and see if it works for you.


Cheers,

   Sylvain
-------------- next part --------------
commit 5b2d6c6abef2680e367fc315fb32f9b51fe5de8f
Author: Sylvain Munaut <tnt at 246tNt.com>
Date:   Sat Oct 19 23:23:57 2013 +0200

    [hack] Make display kernel 8x8

diff --git a/fosphor/cl.c b/fosphor/cl.c
index 3160dc4..530d3a2 100644
--- a/fosphor/cl.c
+++ b/fosphor/cl.c
@@ -679,9 +679,9 @@ fosphor_cl_process(struct fosphor *self,
 
 	/* Execute display kernel */
 	global[0] = FOSPHOR_FFT_LEN;
-	global[1] = 16;
-	local[0] = 16;
-	local[1] = 16;
+	global[1] = 8;
+	local[0] = 8;
+	local[1] = 8;
 
 	err = clEnqueueNDRangeKernel(cl->cq, cl->kern_display, 2, NULL, global, local, 0, NULL, NULL);
 	CL_ERR_CHECK(err, "Unable to queue display kernel execution");
diff --git a/fosphor/display.cl b/fosphor/display.cl
index dd19bba..82d210f 100644
--- a/fosphor/display.cl
+++ b/fosphor/display.cl
@@ -42,6 +42,9 @@
 //#define MAX_HOLD_NORMAL
 #define MAX_HOLD_DECAY
 
+#define QS_LOG	3
+#define QS	(1<<QS_LOG)
+
 
 #ifdef USE_NV_SM11_ATOMICS
 
@@ -64,7 +67,7 @@ inline void nv_sm11_atomic_inc(volatile __local uint *p, uint tag)
 #endif /* USE_NV_SM11_ATOMICS */
 
 
-__attribute__((reqd_work_group_size(16, 16, 1)))
+__attribute__((reqd_work_group_size(QS, QS, 1)))
 __kernel void display(
 	/* FFT Input */
 	__global const float2 *fft,		/* [ 0] Input FFT (complex)      */
@@ -91,20 +94,20 @@ __kernel void display(
 	float max_pwr = - 1000.0f;
 
 	/* Local memory */
-	__local float live_buf[16 * 16];	/* get_local_size(0) * get_local_size(1) */
-	__local float max_buf[16 * 16];		/* get_local_size(0) * get_local_size(1) */
-	__local uint  histo_buf[16 * 128];
+	__local float live_buf[QS * QS];	/* get_local_size(0) * get_local_size(1) */
+	__local float max_buf[QS * QS];		/* get_local_size(0) * get_local_size(1) */
+	__local uint  histo_buf[QS * 128];
 
 	/* Local shortcuts */
 	const float live_one_minus_alpha = 1.0f - live_alpha;
 
 	/* Transposition & Atomic emulation */
 #ifdef USE_NV_SM11_ATOMICS
-	__local float pwr_buf[16 * 16];		/* pwr transpose buffer */
+	__local float pwr_buf[QS * QS];		/* pwr transpose buffer */
 
-	uint tib = (get_local_id(0) + get_local_id(1)) & 15;
-	uint ti0 = tib | (get_local_id(0) << 4);
-	uint ti1 = tib | (get_local_id(1) << 4);
+	uint tib = (get_local_id(0) + get_local_id(1)) & (QS-1);
+	uint ti0 = tib | (get_local_id(0) << QS_LOG);
+	uint ti1 = tib | (get_local_id(1) << QS_LOG);
 
 	const uint tag =  get_local_id(0) << (UINT_BITS - LOG2_WARP_SIZE);
 #endif
@@ -114,6 +117,24 @@ __kernel void display(
 
 	__local uint *h = &histo_buf[get_local_id(1) * get_local_size(0) + get_local_id(0)];
 
+#if QS == 8
+	h[   0] = 0;
+	h[  64] = 0;
+	h[ 128] = 0;
+	h[ 192] = 0;
+	h[ 256] = 0;
+	h[ 320] = 0;
+	h[ 384] = 0;
+	h[ 448] = 0;
+	h[ 512] = 0;
+	h[ 576] = 0;
+	h[ 640] = 0;
+	h[ 704] = 0;
+	h[ 768] = 0;
+	h[ 832] = 0;
+	h[ 896] = 0;
+	h[ 960] = 0;
+#elif QS == 16
 	h[   0] = 0;
 	h[ 256] = 0;
 	h[ 512] = 0;
@@ -122,6 +143,7 @@ __kernel void display(
 	h[1280] = 0;
 	h[1536] = 0;
 	h[1792] = 0;
+#endif
 
 	/* Wait for all clears to be done by everyone */
 	barrier(CLK_LOCAL_MEM_FENCE);
@@ -169,11 +191,11 @@ __kernel void display(
 
 		/* Atomic Bin increment */
 #if defined(USE_NV_SM11_ATOMICS)
-		nv_sm11_atomic_inc(&histo_buf[(bin << 4) + get_local_id(1)], tag);
+		nv_sm11_atomic_inc(&histo_buf[(bin << QS_LOG) + get_local_id(1)], tag);
 #elif defined(USE_EXT_ATOMICS)
-		atom_inc(&histo_buf[(bin << 4) + get_local_id(0)]);
+		atom_inc(&histo_buf[(bin << QS_LOG) + get_local_id(0)]);
 #else
-		atomic_inc(&histo_buf[(bin << 4) + get_local_id(0)]);
+		atomic_inc(&histo_buf[(bin << QS_LOG) + get_local_id(0)]);
 #endif
 	}