Nicolas Tancogne-Dejean
--- a/share/opencl/projector.cl

+ 14

− 26
+++ b/share/opencl/projector.cl

+ 14

− 26
 @@ -42,7 +42,7 @@
 #endif


-__device__ inline double dwarpReduce(double val)
+__device__ inline rtype X(warpReduce)(rtype val)
 {
 #pragma unroll
  for (int offset = warpSize/2; offset > 0; offset /= 2){
 @@ -51,18 +51,6 @@ __device__ inline double dwarpReduce(double val)
  return val;
 }

-__device__ inline double2 zwarpReduce(double2 val)
-{
-#pragma unroll
-  for (int offset = warpSize/2; offset > 0; offset /= 2){
-    val.x += warpShflDown(val.x, offset);
-    val.y += warpShflDown(val.y, offset);
-  }
-  return val;
-}
-
-#endif
-

 __kernel void X(projector_bra)(const int nmat,
          __global int const * restrict offsets,
 @@ -79,8 +67,8 @@ __kernel void X(projector_bra)(const int nmat,
  const int my_warp_size = 1;
 #endif

-  const int ist = get_global_id(0)/my_warp_size;
-  const int ipj = get_global_id(1);
+  const int ist  = get_global_id(0) / my_warp_size;
+  const int ipj  = get_global_id(1);
  const int imat = get_global_id(2);

  const int npoints       = offsets[OFFSET_SIZE*imat + 0];
 @@ -91,27 +79,27 @@ __kernel void X(projector_bra)(const int nmat,

  if(ipj >= nprojs) return;

-  const int nppj = npoints*ipj;
+  const int nppj = npoints * ipj;

 #ifdef CUDA
-  const int slice = npoints%my_warp_size==0 ? npoints/my_warp_size : npoints/my_warp_size+1;
-  const int start = slice * ( get_local_id(0)%my_warp_size ) ;
-  const int end   = min( start + slice , npoints );
+  const int slice = npoints%my_warp_size == 0 ? npoints/my_warp_size : npoints/my_warp_size+1;
+  const int start = slice * (get_local_id(0)%my_warp_size) ;
+  const int end   = min(start + slice, npoints);
  const int step  = 1;
 #else
  const int start = 0;
-  const int end = npoints;
-  const int step = 1;
+  const int end   = npoints;
+  const int step  = 1;
 #endif

  rtype aa = 0.0;
-  for(int ip = start; ip < end; ip+=step){
+  for(int ip = start; ip < end; ip += step){
    aa += MUL(CONJ(matrix[matrix_offset + ip + nppj]), psi[((map[map_offset + ip] - 1)<<ldpsi) + ist]);
  }

 #ifdef CUDA
  aa = X(warpReduce)(aa);
-  if(get_local_id(0)%my_warp_size==0)
+  if(get_local_id(0)%my_warp_size == 0)
 #endif
    projection[ist + ((scal_offset + ipj)<<ldprojection)] = MUL(scal[scal_offset + ipj], aa);

 @@ -674,9 +662,9 @@ __kernel void zprojector_mix(const int nmat,
  const int ipj  = get_global_id(1);
  const int imat = get_global_id(2);

-  const int nprojs        = offsets[OFFSET_SIZE*imat + 1];
-  const int scal_offset   = offsets[OFFSET_SIZE*imat + 4];
-  const int mix_offset_0  = offsets[OFFSET_SIZE*imat + 5];
+  const int nprojs       = offsets[OFFSET_SIZE*imat + 1];
+  const int scal_offset  = offsets[OFFSET_SIZE*imat + 4];
+  const int mix_offset_0 = offsets[OFFSET_SIZE*imat + 5];
  const int mix_offset_1 = mix_offset_0 + nprojs*nprojs;
  const int mix_offset_2 = mix_offset_1 + nprojs*nprojs;
  const int mix_offset_3 = mix_offset_2 + nprojs*nprojs;