Commit 3e6b4f8e authored by degironc

MAJOR restructuring of the FFTXlib library

In real space, processors are organized in a 2D pattern.

Each processor owns data from a subset of Z-planes and a subset of Y-planes.
In reciprocal space, each processor owns the Z-columns that belong to a subset
of X-values. This makes it possible to split the processors into two sets for
communication in the YZ and XY planes (for example, 12 processors can be split
into dfft%nproc2=3 Y-sections times dfft%nproc3=4 Z-plane sets).
Alternatively, if the situation allows for it, a task-group parallelization is
used (with ntg=nyfft), where complete XY planes of ntg wavefunctions are
collected and Fourier transformed in G space by different task groups. This is
preferable to the Z-proc + Y-proc parallelization when task groups can be used,
because a smaller number of larger blocks of data is transferred. Hence three
types of FFT are implemented: 
 
  !
  !! ... isgn = +-1 : parallel 3d fft for rho and for the potential
  !
  !! ... isgn = +-2 : parallel 3d fft for wavefunctions
  !
  !! ... isgn = +-3 : parallel 3d fft for wavefunctions with task group
  !
  !! ... isgn = +   : G-space to R-space, output = \sum_G f(G)exp(+iG*R)
  !! ...              fft along z using pencils        (cft_1z)
  !! ...              transpose across nodes           (fft_scatter_yz)
  !! ...              fft along y using pencils        (cft_1y)
  !! ...              transpose across nodes           (fft_scatter_xy)
  !! ...              fft along x using pencils        (cft_1x)
  !
  !! ... isgn = -   : R-space to G-space, output = \int_R f(R)exp(-iG*R)/Omega
  !! ...              fft along x using pencils        (cft_1x)
  !! ...              transpose across nodes           (fft_scatter_xy)
  !! ...              fft along y using pencils        (cft_1y)
  !! ...              transpose across nodes           (fft_scatter_yz)
  !! ...              fft along z using pencils        (cft_1z)
  !
  ! If task_group_fft_is_active, the FFT acts on a number of wfcs equal to 
  ! dfft%nproc2, the number of Y-sections in which a plane is divided. 
  ! Data are reshuffled by the fft_scatter_tg routine so that each of the 
  ! dfft%nproc2 subgroups (made of dfft%nproc3 procs) deals with whole planes 
  ! of a single wavefunction (see the schematic sketch below).
  !
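  ! As an illustration, the task-group band loop then advances in packs of
  ! 2*dfft%nproc2 bands (a schematic sketch, not literal library code; the
  ! factor 2 appears because two real bands are packed into one complex array,
  ! as done in loop_over_states in the diff below):
  !
  !    DO ib = 1, nbnd, 2 * dfft%nproc2
  !       ! ... pack bands ib ... ib+2*dfft%nproc2-1 into psis in G space (c2psi) ...
  !       CALL invfft( 'tgWave', psis, dfft )  ! each Y-subgroup gets whole planes of one wfc
  !       ! ... accumulate the real-space contributions to the charge density ...
  !    END DO
  !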

The fft_type module has been heavily modified: a number of variables have been renamed 
with more intuitive names (at least to me), and several new variables have been introduced 
for the Y-proc parallelization. For example, as the diff below shows, dtgs%nogrp becomes 
dffts%nproc2, dtgs%ogrp_comm becomes dffts%comm2, and dffts%npp becomes dffts%nr3p, with 
new local-extent variables such as my_nr3p, my_nr2p, my_i0r2p and nnr_tg.

The Task_groups module has been made void: task-group management is now reduced to the 
logical component fft_desc%have_task_groups of the fft_type_descriptor type variable fft_desc.
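A minimal sketch of how a caller can branch on this flag (hypothetical snippet; the
grid names are those documented below):

  IF ( dffts%have_task_groups ) THEN
     CALL invfft( 'tgWave', psis, dffts )  ! several wavefunctions at once
  ELSE
     CALL invfft( 'Wave', psis, dffts )    ! one wavefunction (pair) at a time
  END IF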

In terms of interfaces, the 'easy' calling sequences are

SUBROUTINE invfft/fwfft( grid_type, f, dfft, howmany )

  !! where:
  !! 
  !! **grid_type = 'Dense'** : 
  !!   inverse/direct Fourier transform of potentials and charge density f
  !!   on the dense grid (dfftp). On output, f is overwritten
  !! 
  !! **grid_type = 'Smooth'** :
  !!   inverse/direct Fourier transform of potentials and charge density f
  !!   on the smooth grid (dffts). On output, f is overwritten
  !! 
  !! **grid_type = 'Wave'** :
  !!   inverse/direct Fourier transform of wave functions f
  !!   on the smooth grid (dffts). On output, f is overwritten
  !!
  !! **grid_type = 'tgWave'** :
  !!   inverse/direct Fourier transform of wave functions f with task groups
  !!   on the smooth grid (dffts). On output, f is overwritten
  !!
  !! **grid_type = 'Custom'** : 
  !!   inverse/direct Fourier transform of potentials and charge density f
  !!   on a custom grid (dfft_exx). On output, f is overwritten
  !! 
  !! **grid_type = 'CustomWave'** :
  !!   inverse/direct Fourier transform of wave functions f
  !!   on a custom grid (dfft_exx). On output, f is overwritten
  !! 
  !! **dfft = FFT descriptor**. IMPORTANT NOTICE: the grid is specified only by dfft.
  !!   No check is performed on the correspondence between dfft and grid_type.
  !!   grid_type is now used only to distinguish the cases 'Wave' / 'CustomWave' 
  !!   from all other cases.
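  !!
  !! For instance, applying a local potential to a wavefunction looks like this
  !! (a minimal sketch; psi, vloc and the surrounding setup are placeholders):
  !!
  !!     USE fft_base,       ONLY : dffts
  !!     USE fft_interfaces, ONLY : fwfft, invfft
  !!     COMPLEX(DP) :: psi( dffts%nnr )
  !!     ...
  !!     CALL invfft( 'Wave', psi, dffts )   ! G space -> R space
  !!     psi(:) = psi(:) * vloc(:)           ! act in real space
  !!     CALL fwfft ( 'Wave', psi, dffts )   ! R space -> G space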
                                                                                                 

Many more files modified.




git-svn-id: http://qeforge.qe-forge.org/svn/q-e/trunk/espresso@13676 c92efa57-630b-4861-b058-cf58834340f0
parent 9cd88c5e
@@ -97,6 +97,7 @@ cat > h2o.scf.in << EOF
   ekin_conv_thr = 1.0d-8
   etot_conv_thr = 1.0d-6
   forc_conv_thr = 1.0d-2
+  verbosity="debug"
 /
 &system
   ibrav= 1, celldm(1)=16.0, nat= 3, ntyp= 2,
...
@@ -117,7 +117,7 @@
       USE cg_module,          ONLY: tcg
       USE cp_interfaces,      ONLY: stress_kin, enkin
       USE fft_interfaces,     ONLY: fwfft, invfft
-      USE fft_base,           ONLY: dffts, dfftp, dfft3d, dtgs
+      USE fft_base,           ONLY: dffts, dfftp, dfft3d
       USE cp_interfaces,      ONLY: checkrho, ennl, calrhovan, dennl
       USE cp_main_variables,  ONLY: iprint_stdout, descla
       USE wannier_base,       ONLY: iwf
@@ -152,6 +152,7 @@
       ! local variables
       INTEGER :: iss, isup, isdw, iss1, iss2, ios, i, ir, ig, k
+      INTEGER :: ioff, ioff_tg, nxyp, ir3
       REAL(DP) :: rsumr(2), rsumg(2), sa1, sa2, detmp(6), mtmp(3,3)
       REAL(DP) :: rnegsum, rmin, rmax, rsum
       COMPLEX(DP) :: ci,fp,fm
@@ -348,6 +349,7 @@
          END DO
 !$omp end parallel do
          CALL fwfft('Smooth', psis, dffts )
 !$omp parallel do
          DO ig=1,ngms
             rhog(ig,iss)=psis(nls(ig))
@@ -512,9 +514,8 @@
    SUBROUTINE loop_over_states
       !
       USE parallel_include
-      USE fft_parallel, ONLY: pack_group_sticks, fw_tg_cft3_z, fw_tg_cft3_scatter, fw_tg_cft3_xy
-      USE fft_scalar,   ONLY: cfft3ds
-      USE scatter_mod,  ONLY: maps_sticks_to_3d
+      ! USE fft_scalar,  ONLY: cfft3ds
+      ! USE scatter_mod, ONLY: maps_sticks_to_3d
       !
       ! MAIN LOOP OVER THE EIGENSTATES
       ! - This loop is also parallelized within the task-groups framework
@@ -532,15 +533,15 @@
       REAL(DP), ALLOCATABLE :: tmp_rhos(:,:)
       COMPLEX(DP), ALLOCATABLE :: aux(:)
-      ALLOCATE( psis( dtgs%tg_nnr * dtgs%nogrp ) )
-      ALLOCATE( aux( dtgs%tg_nnr * dtgs%nogrp ) )
+      ALLOCATE( psis( dffts%nnr_tg ) )
+      ALLOCATE( aux( dffts%nnr_tg ) )
       !
-      ALLOCATE( tmp_rhos ( dffts%nr1x * dffts%nr2x * dtgs%tg_npp( me_bgrp + 1 ), nspin ) )
+      ALLOCATE( tmp_rhos ( dffts%nr1x * dffts%nr2x * dffts%my_nr3p, nspin ) )
       !
       tmp_rhos = 0_DP
-      do i = 1, nbsp_bgrp, 2*dtgs%nogrp
+      do i = 1, nbsp_bgrp, 2*dffts%nproc2
         !
         ! Initialize wave-functions in Fourier space (to be FFTed)
         ! The size of psis is nnr: which is equal to the total number
@@ -566,7 +567,7 @@
 !!$omp  parallel
 !!$omp  single
-        do eig_index = 1, 2*dtgs%nogrp, 2
+        do eig_index = 1, 2*dffts%nproc2, 2
           !
 !!$omp task default(none) &
 !!$omp firstprivate( i, eig_offset, nbsp_bgrp, ngw, eig_index ) &
@@ -580,8 +581,8 @@
              !
              ! The eig_index loop is executed only ONCE when NOGRP=1.
              !
-             CALL c2psi( aux(eig_offset*dtgs%tg_nnr+1), dtgs%tg_nnr, &
+             CALL c2psi( psis(eig_offset*dffts%nnr+1), dffts%nnr, &
                          c_bgrp( 1, i+eig_index-1 ), c_bgrp( 1, i+eig_index ), ngw, 2 )
              !
           ENDIF
 !!$omp end task
@@ -599,23 +600,18 @@
        !
        ! now redistribute data
        !
-       !
-       IF( dtgs%nogrp == dtgs%nproc ) THEN
-          CALL pack_group_sticks( aux, psis, dtgs )
-          CALL maps_sticks_to_3d( dffts, dtgs, psis, SIZE(psis), aux, 2 )
-          CALL cfft3ds( aux, dfft3d%nr1, dfft3d%nr2, dfft3d%nr3, &
-                        dfft3d%nr1x,dfft3d%nr2x,dfft3d%nr3x, 1, 1, dfft3d%isind, dfft3d%iplw )
-          psis = aux
-       ELSE
+       ! IF( dffts%nproc2 == dffts%nproc ) THEN
+       !    CALL pack_group_sticks( aux, psis )
+       !    CALL maps_sticks_to_3d( dffts, psis, SIZE(psis), aux, 2 )
+       !    CALL cfft3ds( aux, dfft3d%nr1, dfft3d%nr2, dfft3d%nr3, &
+       !                  dfft3d%nr1x,dfft3d%nr2x,dfft3d%nr3x, 1, 1, dfft3d%isind, dfft3d%iplw )
+       !    psis = aux
+       ! ELSE
        !
-       CALL pack_group_sticks( aux, psis, dtgs )
-       CALL fw_tg_cft3_z( psis, dffts, aux, dtgs )
-       CALL fw_tg_cft3_scatter( psis, dffts, aux, dtgs )
-       CALL fw_tg_cft3_xy( psis, dffts, dtgs )
-       END IF
+       CALL invfft ('tgWave', psis, dffts )
+       ! END IF
 #else
        psis = (0.d0, 0.d0)
        CALL c2psi( psis, dffts%nnr, c_bgrp( 1, i ), c_bgrp( 1, i+1 ), ngw, 2 )
@@ -631,9 +627,7 @@
        !
        ! Compute the proper factor for each band
        !
-       DO ii = 1, dtgs%nogrp
-          IF( dtgs%nolist( ii ) == me_bgrp ) EXIT
-       END DO
+       ii = dffts%mype2 + 1
        !
        ! Remember two bands are packed in a single array :
        ! proc 0 has bands ibnd and ibnd+1
@@ -673,11 +667,11 @@
        !code this should be equal to the total number of planes
        !
-       ir = dffts%nr1x*dffts%nr2x*dtgs%tg_npp( me_bgrp + 1 )
+       ir = dffts%nr1x*dffts%nr2x*dffts%my_nr3p
        IF( ir > SIZE( psis ) ) &
          CALL errore( ' rhoofr ', ' psis size too small ', ir )
-       do ir = 1, dffts%nr1x*dffts%nr2x*dtgs%tg_npp( me_bgrp + 1 )
+       do ir = 1, dffts%nr1x*dffts%nr2x*dffts%my_nr3p
          tmp_rhos(ir,iss1) = tmp_rhos(ir,iss1) + sa1*( real(psis(ir)))**2
          tmp_rhos(ir,iss2) = tmp_rhos(ir,iss2) + sa2*(aimag(psis(ir)))**2
        end do
@@ -693,24 +687,20 @@
        ! CALL MPI_REDUCE( rho(1+ioff*nr1*nr2,1), rhos(1,1), dffts%nnr, MPI_DOUBLE_PRECISION, MPI_SUM, ip-1, intra_bgrp_comm, ierr)
        ! ioff = ioff + dffts%npp( ip )
        !END DO
-       IF ( dtgs%nogrp > 1 ) THEN
-          CALL mp_sum( tmp_rhos, gid = dtgs%ogrp_comm )
-       ENDIF
+       IF ( dffts%nproc2 > 1 ) CALL mp_sum( tmp_rhos, gid = dffts%comm2 )
        !
        !BRING CHARGE DENSITY BACK TO ITS ORIGINAL POSITION
        !
        !If the current processor is not the "first" processor in its
        !orbital group then does a local copy (reshuffling) of its data
        !
-       from = 1
-       DO ii = 1, dtgs%nogrp
-          IF ( dtgs%nolist( ii ) == me_bgrp ) EXIT !Exit the loop
-          from = from + dffts%nr1x*dffts%nr2x*dffts%npp( dtgs%nolist( ii ) + 1 ) ! From where to copy initially
-       ENDDO
-       !
-       DO ir = 1, nspin
-          CALL dcopy( dffts%nr1x*dffts%nr2x*dffts%npp(me_bgrp+1), tmp_rhos(from,ir), 1, rhos(1,ir), 1)
-       ENDDO
+       nxyp = dffts%nr1x * dffts%my_nr2p
+       DO ir3 = 1, dffts%my_nr3p
+          ioff    = dffts%nr1x * dffts%my_nr2p * (ir3-1)
+          ioff_tg = dffts%nr1x * dffts%nr2x * (ir3-1) + dffts%nr1x * dffts%my_i0r2p
+          rhos(ioff+1:ioff+nxyp,1:nspin) = rhos(ioff+1:ioff+nxyp,1:nspin) + tmp_rhos(ioff_tg+1:ioff_tg+nxyp,1:nspin)
+       END DO
       DEALLOCATE( tmp_rhos )
       DEALLOCATE( aux )
@@ -866,7 +856,7 @@ SUBROUTINE drhov(irb,eigrb,rhovan,drhovan,rhog,rhor,drhog,drhor)
       USE cell_base,      ONLY: ainv
       USE qgb_mod,        ONLY: qgb, dqgb
       USE fft_interfaces, ONLY: fwfft, invfft
-      USE fft_base,       ONLY: dfftb, dfftp, dfftb
+      USE fft_base,       ONLY: dfftb, dfftp
       USE mp_global,      ONLY: my_bgrp_id, nbgrp, inter_bgrp_comm
       USE mp,             ONLY: mp_sum
@@ -963,7 +953,7 @@ SUBROUTINE drhov(irb,eigrb,rhovan,drhovan,rhog,rhor,drhog,drhor)
 #if defined(__MPI)
          DO ia=1,na(is)
            nfft=1
-           IF ( ( dfftb%np3( isa ) <= 0 ) ) THEN
+           IF ( ( dfftb%np3( isa ) <= 0 ) .OR. ( dfftb%np2( isa ) <= 0 ) ) THEN
              isa = isa + nfft
              CYCLE
            END IF
@@ -1074,7 +1064,7 @@ SUBROUTINE drhov(irb,eigrb,rhovan,drhovan,rhog,rhor,drhog,drhor)
      DO is=1,nvb
        DO ia=1,na(is)
 #if defined(__MPI)
-         IF ( dfftb%np3( isa ) <= 0 ) go to 25
+         IF ( ( dfftb%np3( isa ) <= 0 ) .OR. ( dfftb%np2( isa ) <= 0 ) ) go to 25
 #endif
          DO iss=1,2
            dqgbt(:,iss) = (0.d0, 0.d0)
@@ -1255,16 +1245,13 @@ SUBROUTINE rhov(irb,eigrb,rhovan,rhog,rhor)
      DO is = 1, nvb
 #if defined(__MPI)
        DO ia = 1, na(is)
          nfft = 1
-         IF ( dfftb%np3( isa ) <= 0 ) THEN
+         IF ( ( dfftb%np3( isa ) <= 0 ) .OR. ( dfftb%np2( isa ) <= 0 ) ) THEN
            isa = isa + nfft
            CYCLE
          END IF
 #else
        DO ia = 1, na(is), 2
          !
          ! nfft=2 if two ffts at the same time are performed
@@ -1301,12 +1288,10 @@ SUBROUTINE rhov(irb,eigrb,rhovan,rhog,rhor)
            qv(:) = (0.d0, 0.d0)
            IF(nfft.EQ.2)THEN
              DO ig=1,ngb
-               qv(npb(ig))= &
-                    eigrb(ig,isa )*qgbt(ig,1) &
-                  + ci* eigrb(ig,isa+1)*qgbt(ig,2)
-               qv(nmb(ig))= &
-                    CONJG(eigrb(ig,isa )*qgbt(ig,1)) &
-                  + ci*CONJG(eigrb(ig,isa+1)*qgbt(ig,2))
+               qv(npb(ig))= eigrb(ig,isa )*qgbt(ig,1) &
+                          + ci * eigrb(ig,isa+1)*qgbt(ig,2)
+               qv(nmb(ig))= CONJG(eigrb(ig,isa )*qgbt(ig,1)) &
+                          + ci * CONJG(eigrb(ig,isa+1)*qgbt(ig,2))
              END DO
            ELSE
              DO ig=1,ngb
@@ -1400,7 +1385,7 @@ SUBROUTINE rhov(irb,eigrb,rhovan,rhog,rhor)
      DO is=1,nvb
        DO ia=1,na(is)
 #if defined(__MPI)
-         IF ( dfftb%np3( isa ) <= 0 ) go to 25
+         IF ( ( dfftb%np3( isa ) <= 0 ) .OR. ( dfftb%np2( isa ) <= 0 ) ) go to 25
 #endif
          DO iss=1,2
            qgbt(:,iss) = (0.d0, 0.d0)
...
@@ -781,7 +781,7 @@ subroutine formf( tfirst, eself )
          rhopsum = SUM( rhops( 1:ngms, is ) )
          call mp_sum( vpsum, intra_bgrp_comm )
          call mp_sum( rhopsum, intra_bgrp_comm )
-         WRITE( stdout,1250) vps(1,is),rhops(1,is)
+         WRITE( stdout,1250) (vps(ig,is),rhops(ig,is),ig=1,5)
          WRITE( stdout,1300) vpsum,rhopsum
       endif
       !
...
@@ -210,6 +210,7 @@ SUBROUTINE cprmain( tau_out, fion_out, etot_out )
         tfile   = ( MOD( nfi, iprint ) == 0 )
         tstdout = ( MOD( nfi, iprint_stdout ) == 0 ) .OR. tlast
+        !
         IF ( abivol ) THEN
            IF ( pvar ) THEN
               IF ( nfi .EQ. 1 ) THEN
@@ -369,6 +370,7 @@ SUBROUTINE cprmain( tau_out, fion_out, etot_out )
         !
         END IF
         !
+        !
         !=======================================================================
         !
         ! verlet algorithm
...
@@ -25,6 +25,7 @@ SUBROUTINE cpr_loop( nloop )
    REAL(DP) :: etot
    !
    !
    IF ( nat > 0 ) THEN
       !
       ALLOCATE( tau( 3, nat ) )
...
@@ -10,7 +10,7 @@ SUBROUTINE exx_gs(nfi, c)
     !
     USE kinds,        ONLY : DP
     USE constants,    ONLY : fpi
-    USE fft_base,     ONLY : dffts, dfftp, dtgs
+    USE fft_base,     ONLY : dffts, dfftp
     USE mp,           ONLY : mp_barrier, mp_sum
     USE mp_global,    ONLY : nproc_image, me_image, root_image, intra_image_comm, intra_bgrp_comm, me_bgrp
     USE parallel_include
@@ -886,7 +886,7 @@ SUBROUTINE exx_gs(nfi, c)
     !
     !-----------Zhaofeng's vpsil (local) to exx_potential (global) -----------
     !
-    nogrp = dtgs%nogrp
+    nogrp = dffts%nproc2
     !
     ALLOCATE( sdispls(nproc_image), sendcount(nproc_image) ); sdispls=0; sendcount=0
     ALLOCATE( rdispls(nproc_image), recvcount(nproc_image) ); rdispls=0; recvcount=0
@@ -961,10 +961,10 @@ SUBROUTINE exx_gs(nfi, c)
 #if defined(__MPI)
     !
     DO ir=1,nproc_image/nogrp
-       CALL mp_barrier( dtgs%ogrp_comm )
+       CALL mp_barrier( dffts%comm2 )
        CALL MPI_ALLTOALLV(exx_tmp3(1,ir), sendcount1, sdispls1, MPI_DOUBLE_PRECISION, &
           &               exx_potential(1,ir),recvcount1, rdispls1, MPI_DOUBLE_PRECISION, &
-          &               dtgs%ogrp_comm, ierr)
+          &               dffts%comm2, ierr)
     END DO
 #endif
     !
...
@@ -36,7 +36,6 @@ MODULE exx_module
       USE electrons_base,   ONLY: nupdwn            !number of states with up and down spin
       USE fft_base,         ONLY: dffts             !FFT derived data type
       USE fft_base,         ONLY: dfftp             !FFT derived data type
-      USE fft_base,         ONLY: dtgs              !FFT task groups
       USE funct,            ONLY: get_exx_fraction  ! function to get exx_fraction value
       USE funct,            ONLY: stop_exx, start_exx
       USE input_parameters, ONLY: ref_alat          !alat of reference cell ..
@@ -167,7 +166,7 @@ CONTAINS
 #if defined(__OPENMP)
      WRITE(stdout,'(5X,"OpenMP threads/MPI task",3X,I4)') omp_get_max_threads()
 #endif
-     WRITE(stdout,'(5X,"Taskgroups ",3X,I7)') dtgs%nogrp
+     WRITE(stdout,'(5X,"Taskgroups ",3X,I7)') dffts%nproc2
     !
     ! the fraction of exact exchange is stored here
    !
@@ -269,11 +268,11 @@ CONTAINS
       !
     END IF
     !
-    IF((nproc_image.LE.nbsp).AND.(dtgs%nogrp.GT.1)) CALL errore('exx_module','EXX calculation error : &
+    IF((nproc_image.LE.nbsp).AND.(dffts%nproc2.GT.1)) CALL errore('exx_module','EXX calculation error : &
        & use taskgroup (-ntg) = 1 when number of MPI tasks is less or equal to the number of electronic states',1)
     !
     ! to fix this issue. see file exx_psi.f90, exx_gs.f90
-    IF(nproc_image.GT.nbsp.AND.MOD(dffts%nnr,dtgs%nogrp).NE.0) CALL errore('exx_module','EXX calculation error : &
+    IF(nproc_image.GT.nbsp.AND.MOD(dffts%nnr,dffts%nproc2).NE.0) CALL errore('exx_module','EXX calculation error : &
       & (nr1x * nr2x) is not integer multiple of the number of task groups. Change task groups such that &
       & (nr1x * nr2x) becomes integer multiple of the number of task groups. Otherwise restrict number of MPI tasks &
       & up to the number of electronic states.',1)
@@ -284,7 +283,7 @@ CONTAINS
      & or equal to the electronic bands. Otherwise, change ecutwfc to make (nr1x * nr2x) an even number.',1)
     !
     ! to fix this issue. see file exx_psi.f90, exx_gs.f90
-    IF((nproc_image.GT.nbsp).AND.MOD(nbsp,2*dtgs%nogrp).NE.0) CALL errore('exx_module','EXX calculation error : &
+    IF((nproc_image.GT.nbsp).AND.MOD(nbsp,2*dffts%nproc2).NE.0) CALL errore('exx_module','EXX calculation error : &
       & number of electronic states is not integer multiple of two times the number of task groups. &
      & Either change the number of taskgroups or restrict number of MPI tasks up to the number of electronic states.',1)
     !
@@ -302,7 +301,7 @@ CONTAINS
       write(stdout,*) "You may want to use number of MPI tasks = ", CEILING(DBLE(2.0*dfftp%nr3)/DBLE(nbsp))*nbsp,&
          & "(combined with -ntg 2)"
       !
-    ELSE IF (NINT(2**(LOG(DBLE(INT(nproc_image / dfftp%nr3))) / LOG(2.0))).GT.dtgs%nogrp) THEN
+    ELSE IF (NINT(2**(LOG(DBLE(INT(nproc_image / dfftp%nr3))) / LOG(2.0))).GT.dffts%nproc2) THEN
       !
       write(stdout,*)
       write(stdout,*) "**********************************************************************************************"
@@ -312,7 +311,7 @@ CONTAINS
         NINT(2**(LOG(DBLE(INT(nproc_image / dfftp%nr3))) / LOG(2.0)))
     END IF
     !
-    IF(dtgs%nogrp.EQ.1) THEN
+    IF(dffts%nproc2.EQ.1) THEN
       !
       write(stdout,*)
       write(stdout,*) "**********************************************************************************************"
@@ -329,9 +328,9 @@ CONTAINS
        & One needs number of task groups = 2^n where n is a positive integer when number of MPI tasks is greater than &
        & the number of electronic states. See above for Possible Solutions',1)
      !
-    ELSE IF (NINT(2**(LOG(DBLE(dtgs%nogrp)) / LOG(2.0))).NE.dtgs%nogrp) THEN
+    ELSE IF (NINT(2**(LOG(DBLE(dffts%nproc2)) / LOG(2.0))).NE.dffts%nproc2) THEN
      !
-     ! NINT(2**(LOG(DBLE(dtgs%nogrp)) / LOG(2.0))) is the largest power of 2 that is smaller or equal to dffts%nogrp
+     ! NINT(2**(LOG(DBLE(dffts%nproc2)) / LOG(2.0))) is the largest power of 2 that is smaller or equal to dffts%nogrp
      !
      CALL errore('exx_module','EXX calculation error : &
        & One needs number of task groups = 2^n where n is a positive integer when number of MPI tasks is greater than &
@@ -340,7 +339,7 @@ CONTAINS
     !
    END IF
    !
-   IF((dtgs%nogrp.GT.1).AND.(dfftp%nr3*dtgs%nogrp.GT.nproc_image)) CALL errore('exx_module','EXX calculation error : &
+   IF((dffts%nproc2.GT.1).AND.(dfftp%nr3*dffts%nproc2.GT.nproc_image)) CALL errore('exx_module','EXX calculation error : &
      & (nr3x * number of taskgroups) is greater than the number of MPI tasks. Change the number of MPI tasks or the number &
     & of taskgroups or both. To estimate ntg, find the value of nr3x in the output and compute (MPI task/nr3x) and take &
     & the integer value.',1)
@@ -377,21 +376,21 @@ CONTAINS
    !
    IF (nproc_image .LT. nbsp) THEN
      !
-     ALLOCATE( exx_potential(dffts%nr1*dffts%nr2*dffts%npp(me_bgrp+1),nbsp) )
+     ALLOCATE( exx_potential(dffts%nr1*dffts%nr2*dffts%my_nr3p,nbsp) )
      !
    ELSE
      !
-     IF ( dtgs%have_task_groups ) THEN
+     IF ( dffts%have_task_groups ) THEN
        !
-       ALLOCATE( exx_potential(dffts%nnr,nproc_image/dtgs%nogrp) )
+       ALLOCATE( exx_potential(dffts%nnr,nproc_image/dffts%nproc2) )
        !
-       IF(MOD(nproc_image,dtgs%nogrp).NE.0) CALL errore &
+       IF(MOD(nproc_image,dffts%nproc2).NE.0) CALL errore &
          & ('exx_module','EXX calculation is not working when &
         & number of MPI tasks (nproc_image) is not integer multiple of number of taskgroups',1)
        !
      ELSE
        !
-       ALLOCATE( exx_potential(dffts%nr1x*dffts%nr2x*dffts%npp(me_bgrp+1),nproc_image) ) !
+       ALLOCATE( exx_potential(dffts%nr1x*dffts%nr2x*dffts%my_nr3p,nproc_image) ) !
        !
      END IF
      !
@@ -491,7 +490,7 @@ CONTAINS
    !
    ! ** Note that in this case ..
    ! this is incorrect: my_nxyz(:) = nr1*nr2*dffts%npp(me_image+1)
-   my_nxyz(:) = nr1*nr2*dffts%npp
+   my_nxyz(:) = nr1*nr2*dffts%nr3p
    !
    !DEBUG
    !WRITE(stdout,'("my_nbsp")')
...
@@ -8,7 +8,7 @@ SUBROUTINE exx_psi(c, psitot2,nnrtot,my_nbsp, my_nxyz, nbsp)
     !
     USE kinds,          ONLY : DP
     USE fft_interfaces, ONLY : invfft
-    USE fft_base,       ONLY : dffts, dfftp, dtgs
+    USE fft_base,       ONLY : dffts, dfftp
     USE gvecw,          ONLY : ngw
     USE mp_global,      ONLY : nproc_image, me_image,intra_image_comm
     USE cell_base,      ONLY : omega
@@ -47,9 +47,9 @@ SUBROUTINE exx_psi(c, psitot2,nnrtot,my_nbsp, my_nxyz, nbsp)
     nr1s=dfftp%nr1; nr2s=dfftp%nr2; nr3s=dfftp%nr3