Skip to content
Snippets Groups Projects

Fix OEP full - spin polarized case

Merged Nicolas Tancogne-Dejean requested to merge fix_oep_polarized into hotfix-12.2
2 files
+ 16
29
Compare changes
  • Side-by-side
  • Inline
Files
2
+ 14
26
@@ -42,7 +42,7 @@
#endif
__device__ inline double dwarpReduce(double val)
__device__ inline rtype X(warpReduce)(rtype val)
{
#pragma unroll
for (int offset = warpSize/2; offset > 0; offset /= 2){
@@ -51,18 +51,6 @@ __device__ inline double dwarpReduce(double val)
return val;
}
/*
 * Warp-level sum of a double2 value, done independently on each component.
 *
 * The shuffle distance starts at warpSize/2 and is halved every round until
 * it reaches zero; in each round the caller's .x and .y are each incremented
 * by whatever warpShflDown() returns for that component at that distance.
 *
 * NOTE(review): warpShflDown is defined outside this block - presumably a
 * wrapper around a shuffle-down intrinsic, in which case the complete sum
 * ends up in the first lane of the warp. Confirm against its definition.
 *
 * val     : this lane's contribution.
 * returns : this lane's accumulated double2 after the last round.
 */
__device__ inline double2 zwarpReduce(double2 val)
{
  /* Halve the distance with a shift; the distance is always positive here. */
#pragma unroll
  for (int delta = warpSize >> 1; delta > 0; delta >>= 1)
  {
    /* First component first, then the second, one shuffle per component. */
    val.x += warpShflDown(val.x, delta);
    val.y += warpShflDown(val.y, delta);
  }

  return val;
}
#endif
__kernel void X(projector_bra)(const int nmat,
__global int const * restrict offsets,
@@ -79,8 +67,8 @@ __kernel void X(projector_bra)(const int nmat,
const int my_warp_size = 1;
#endif
const int ist = get_global_id(0)/my_warp_size;
const int ipj = get_global_id(1);
const int ist = get_global_id(0) / my_warp_size;
const int ipj = get_global_id(1);
const int imat = get_global_id(2);
const int npoints = offsets[OFFSET_SIZE*imat + 0];
@@ -91,27 +79,27 @@ __kernel void X(projector_bra)(const int nmat,
if(ipj >= nprojs) return;
const int nppj = npoints*ipj;
const int nppj = npoints * ipj;
#ifdef CUDA
const int slice = npoints%my_warp_size==0 ? npoints/my_warp_size : npoints/my_warp_size+1;
const int start = slice * ( get_local_id(0)%my_warp_size ) ;
const int end = min( start + slice , npoints );
const int slice = npoints%my_warp_size == 0 ? npoints/my_warp_size : npoints/my_warp_size+1;
const int start = slice * (get_local_id(0)%my_warp_size) ;
const int end = min(start + slice, npoints);
const int step = 1;
#else
const int start = 0;
const int end = npoints;
const int step = 1;
const int end = npoints;
const int step = 1;
#endif
rtype aa = 0.0;
for(int ip = start; ip < end; ip+=step){
for(int ip = start; ip < end; ip += step){
aa += MUL(CONJ(matrix[matrix_offset + ip + nppj]), psi[((map[map_offset + ip] - 1)<<ldpsi) + ist]);
}
#ifdef CUDA
aa = X(warpReduce)(aa);
if(get_local_id(0)%my_warp_size==0)
if(get_local_id(0)%my_warp_size == 0)
#endif
projection[ist + ((scal_offset + ipj)<<ldprojection)] = MUL(scal[scal_offset + ipj], aa);
@@ -674,9 +662,9 @@ __kernel void zprojector_mix(const int nmat,
const int ipj = get_global_id(1);
const int imat = get_global_id(2);
const int nprojs = offsets[OFFSET_SIZE*imat + 1];
const int scal_offset = offsets[OFFSET_SIZE*imat + 4];
const int mix_offset_0 = offsets[OFFSET_SIZE*imat + 5];
const int nprojs = offsets[OFFSET_SIZE*imat + 1];
const int scal_offset = offsets[OFFSET_SIZE*imat + 4];
const int mix_offset_0 = offsets[OFFSET_SIZE*imat + 5];
const int mix_offset_1 = mix_offset_0 + nprojs*nprojs;
const int mix_offset_2 = mix_offset_1 + nprojs*nprojs;
const int mix_offset_3 = mix_offset_2 + nprojs*nprojs;
Loading