Commit 3e6b4f8e authored by degironc's avatar degironc

MAJOR restructuring of the FFTXlib library

In real space processors are organized in a 2D pattern.

Each processor owns data from a sub-set of Z-planes and a sub-set of Y-planes.
In reciprocal space each processor owns Z-columns that belong to a subset of
X-values. This allows to split the processors in two sets for communication
in the YZ and XY planes.
Alternatively, if the situation allows for it, a task group parallelization is used
(with ntg=nyfft) where complete XY planes of ntg wavefunctions are collected and Fourier
transformed in G space by different task-groups. This is preferable to the Z-proc + Y-proc
parallelization if task groups can be used, because a smaller number of larger amounts of
data are transferred. Hence three types of fft are implemented: 
 
  !
  !! ... isgn = +-1 : parallel 3d fft for rho and for the potential
  !
  !! ... isgn = +-2 : parallel 3d fft for wavefunctions
  !
  !! ... isgn = +-3 : parallel 3d fft for wavefunctions with task group
  !
  !! ... isgn = +   : G-space to R-space, output = \sum_G f(G)exp(+iG*R)
  !! ...              fft along z using pencils        (cft_1z)
  !! ...              transpose across nodes           (fft_scatter_yz)
  !! ...              fft along y using pencils        (cft_1y)
  !! ...              transpose across nodes           (fft_scatter_xy)
  !! ...              fft along x using pencils        (cft_1x)
  !
  !! ... isgn = -   : R-space to G-space, output = \int_R f(R)exp(-iG*R)/Omega
  !! ...              fft along x using pencils        (cft_1x)
  !! ...              transpose across nodes           (fft_scatter_xy)
  !! ...              fft along y using pencils        (cft_1y)
  !! ...              transpose across nodes           (fft_scatter_yz)
  !! ...              fft along z using pencils        (cft_1z)
  !
  ! If task_group_fft_is_active the FFT acts on a number of wfcs equal to 
  ! dfft%nproc2, the number of Y-sections in which a plane is divided. 
  ! Data are reshuffled by the fft_scatter_tg routine so that each of the 
  ! dfft%nproc2 subgroups (made by dfft%nproc3 procs) deals with whole planes 
  ! of a single wavefunction.
  !

fft_type module heavily modified, a number of variables renamed with more intuitive names 
(at least to me), and a number of additional variables introduced for the Y-proc parallelization.

Task_group module made void. task_group management is now reduced to the logical component
 fft_desc%have_task_groups of fft_type_descriptor type variable fft_desc.

In terms of interfaces, the 'easy' calling sequences are

SUBROUTINE invfft/fwfft( grid_type, f, dfft, howmany )

  !! where:
  !! 
  !! **grid_type = 'Dense'** : 
  !!   inverse/direct fourier transform of potentials and charge density f
  !!   on the dense grid (dfftp). On output, f is overwritten
  !! 
  !! **grid_type = 'Smooth'** :
  !!   inverse/direct fourier transform of  potentials and charge density f
  !!   on the smooth grid (dffts). On output, f is overwritten
  !! 
  !! **grid_type = 'Wave'** :
  !!   inverse/direct fourier transform of  wave functions f
  !!   on the smooth grid (dffts). On output, f is overwritten
  !!
  !! **grid_type = 'tgWave'** :
  !!   inverse/direct fourier transform of  wave functions f with task group
  !!   on the smooth grid (dffts). On output, f is overwritten
  !!
  !! **grid_type = 'Custom'** : 
  !!   inverse/direct fourier transform of potentials and charge density f
  !!   on a custom grid (dfft_exx). On output, f is overwritten
  !! 
  !! **grid_type = 'CustomWave'** :
  !!   inverse/direct fourier transform of  wave functions f
  !!   on a custom grid (dfft_exx). On output, f is overwritten
  !! 
  !! **dfft = FFT descriptor**, IMPORTANT NOTICE: grid is specified only by dfft.
  !!   No check is performed on the correspondence between dfft and grid_type.
  !!   grid_type is now used only to distinguish cases 'Wave' / 'CustomWave' 
  !!   from all other cases
                                                                                                 

Many more files modified.




git-svn-id: http://qeforge.qe-forge.org/svn/q-e/trunk/espresso@13676 c92efa57-630b-4861-b058-cf58834340f0
parent 9cd88c5e
......@@ -97,6 +97,7 @@ cat > h2o.scf.in << EOF
ekin_conv_thr = 1.0d-8
etot_conv_thr = 1.0d-6
forc_conv_thr = 1.0d-2
verbosity="debug"
/
&system
ibrav= 1, celldm(1)=16.0, nat= 3, ntyp= 2,
......
......@@ -117,7 +117,7 @@
USE cg_module, ONLY: tcg
USE cp_interfaces, ONLY: stress_kin, enkin
USE fft_interfaces, ONLY: fwfft, invfft
USE fft_base, ONLY: dffts, dfftp, dfft3d, dtgs
USE fft_base, ONLY: dffts, dfftp, dfft3d
USE cp_interfaces, ONLY: checkrho, ennl, calrhovan, dennl
USE cp_main_variables, ONLY: iprint_stdout, descla
USE wannier_base, ONLY: iwf
......@@ -152,6 +152,7 @@
! local variables
INTEGER :: iss, isup, isdw, iss1, iss2, ios, i, ir, ig, k
INTEGER :: ioff, ioff_tg, nxyp, ir3
REAL(DP) :: rsumr(2), rsumg(2), sa1, sa2, detmp(6), mtmp(3,3)
REAL(DP) :: rnegsum, rmin, rmax, rsum
COMPLEX(DP) :: ci,fp,fm
......@@ -348,6 +349,7 @@
END DO
!$omp end parallel do
CALL fwfft('Smooth', psis, dffts )
!$omp parallel do
DO ig=1,ngms
rhog(ig,iss)=psis(nls(ig))
......@@ -512,9 +514,8 @@
SUBROUTINE loop_over_states
!
USE parallel_include
USE fft_parallel, ONLY: pack_group_sticks, fw_tg_cft3_z, fw_tg_cft3_scatter, fw_tg_cft3_xy
USE fft_scalar, ONLY: cfft3ds
USE scatter_mod, ONLY: maps_sticks_to_3d
! USE fft_scalar, ONLY: cfft3ds
! USE scatter_mod, ONLY: maps_sticks_to_3d
!
! MAIN LOOP OVER THE EIGENSTATES
! - This loop is also parallelized within the task-groups framework
......@@ -532,15 +533,15 @@
REAL(DP), ALLOCATABLE :: tmp_rhos(:,:)
COMPLEX(DP), ALLOCATABLE :: aux(:)
ALLOCATE( psis( dtgs%tg_nnr * dtgs%nogrp ) )
ALLOCATE( aux( dtgs%tg_nnr * dtgs%nogrp ) )
ALLOCATE( psis( dffts%nnr_tg ) )
ALLOCATE( aux( dffts%nnr_tg ) )
!
ALLOCATE( tmp_rhos ( dffts%nr1x * dffts%nr2x * dtgs%tg_npp( me_bgrp + 1 ), nspin ) )
ALLOCATE( tmp_rhos ( dffts%nr1x * dffts%nr2x * dffts%my_nr3p, nspin ) )
!
tmp_rhos = 0_DP
do i = 1, nbsp_bgrp, 2*dffts%nproc2
do i = 1, nbsp_bgrp, 2*dtgs%nogrp
!
! Initialize wave-functions in Fourier space (to be FFTed)
! The size of psis is nnr: which is equal to the total number
......@@ -566,7 +567,7 @@
!!$omp parallel
!!$omp single
do eig_index = 1, 2*dtgs%nogrp, 2
do eig_index = 1, 2*dffts%nproc2, 2
!
!!$omp task default(none) &
!!$omp firstprivate( i, eig_offset, nbsp_bgrp, ngw, eig_index ) &
......@@ -580,8 +581,8 @@
!
! The eig_index loop is executed only ONCE when NOGRP=1.
!
CALL c2psi( aux(eig_offset*dtgs%tg_nnr+1), dtgs%tg_nnr, &
c_bgrp( 1, i+eig_index-1 ), c_bgrp( 1, i+eig_index ), ngw, 2 )
CALL c2psi( psis(eig_offset*dffts%nnr+1), dffts%nnr, &
c_bgrp( 1, i+eig_index-1 ), c_bgrp( 1, i+eig_index ), ngw, 2 )
!
ENDIF
!!$omp end task
......@@ -599,23 +600,18 @@
!
! now redistribute data
!
!
IF( dtgs%nogrp == dtgs%nproc ) THEN
CALL pack_group_sticks( aux, psis, dtgs )
CALL maps_sticks_to_3d( dffts, dtgs, psis, SIZE(psis), aux, 2 )
CALL cfft3ds( aux, dfft3d%nr1, dfft3d%nr2, dfft3d%nr3, &
dfft3d%nr1x,dfft3d%nr2x,dfft3d%nr3x, 1, 1, dfft3d%isind, dfft3d%iplw )
psis = aux
ELSE
! IF( dffts%nproc2 == dffts%nproc ) THEN
! CALL pack_group_sticks( aux, psis )
! CALL maps_sticks_to_3d( dffts, psis, SIZE(psis), aux, 2 )
! CALL cfft3ds( aux, dfft3d%nr1, dfft3d%nr2, dfft3d%nr3, &
! dfft3d%nr1x,dfft3d%nr2x,dfft3d%nr3x, 1, 1, dfft3d%isind, dfft3d%iplw )
! psis = aux
! ELSE
!
CALL pack_group_sticks( aux, psis, dtgs )
CALL fw_tg_cft3_z( psis, dffts, aux, dtgs )
CALL fw_tg_cft3_scatter( psis, dffts, aux, dtgs )
CALL fw_tg_cft3_xy( psis, dffts, dtgs )
END IF
CALL invfft ('tgWave', psis, dffts )
! END IF
#else
psis = (0.d0, 0.d0)
CALL c2psi( psis, dffts%nnr, c_bgrp( 1, i ), c_bgrp( 1, i+1 ), ngw, 2 )
......@@ -631,9 +627,7 @@
!
! Compute the proper factor for each band
!
DO ii = 1, dtgs%nogrp
IF( dtgs%nolist( ii ) == me_bgrp ) EXIT
END DO
ii=dffts%mype2+1
!
! Remember two bands are packed in a single array :
! proc 0 has bands ibnd and ibnd+1
......@@ -673,11 +667,11 @@
!code this should be equal to the total number of planes
!
ir = dffts%nr1x*dffts%nr2x*dtgs%tg_npp( me_bgrp + 1 )
ir = dffts%nr1x*dffts%nr2x*dffts%my_nr3p
IF( ir > SIZE( psis ) ) &
CALL errore( ' rhoofr ', ' psis size too small ', ir )
do ir = 1, dffts%nr1x*dffts%nr2x*dtgs%tg_npp( me_bgrp + 1 )
do ir = 1, dffts%nr1x*dffts%nr2x*dffts%my_nr3p
tmp_rhos(ir,iss1) = tmp_rhos(ir,iss1) + sa1*( real(psis(ir)))**2
tmp_rhos(ir,iss2) = tmp_rhos(ir,iss2) + sa2*(aimag(psis(ir)))**2
end do
......@@ -693,24 +687,20 @@
! CALL MPI_REDUCE( rho(1+ioff*nr1*nr2,1), rhos(1,1), dffts%nnr, MPI_DOUBLE_PRECISION, MPI_SUM, ip-1, intra_bgrp_comm, ierr)
! ioff = ioff + dffts%npp( ip )
!END DO
IF ( dtgs%nogrp > 1 ) THEN
CALL mp_sum( tmp_rhos, gid = dtgs%ogrp_comm )
ENDIF
IF ( dffts%nproc2 > 1 ) CALL mp_sum( tmp_rhos, gid = dffts%comm2 )
!
!BRING CHARGE DENSITY BACK TO ITS ORIGINAL POSITION
!
!If the current processor is not the "first" processor in its
!orbital group then does a local copy (reshuffling) of its data
!
from = 1
DO ii = 1, dtgs%nogrp
IF ( dtgs%nolist( ii ) == me_bgrp ) EXIT !Exit the loop
from = from + dffts%nr1x*dffts%nr2x*dffts%npp( dtgs%nolist( ii ) + 1 )! From where to copy initially
ENDDO
!
DO ir = 1, nspin
CALL dcopy( dffts%nr1x*dffts%nr2x*dffts%npp(me_bgrp+1), tmp_rhos(from,ir), 1, rhos(1,ir), 1)
ENDDO
nxyp = dffts%nr1x * dffts%my_nr2p
DO ir3 = 1, dffts%my_nr3p
ioff = dffts%nr1x * dffts%my_nr2p * (ir3-1)
ioff_tg = dffts%nr1x * dffts%nr2x * (ir3-1) + dffts%nr1x * dffts%my_i0r2p
rhos(ioff+1:ioff+nxyp,1:nspin) = rhos(ioff+1:ioff+nxyp,1:nspin) + tmp_rhos(ioff_tg+1:ioff_tg+nxyp,1:nspin)
END DO
DEALLOCATE( tmp_rhos )
DEALLOCATE( aux )
......@@ -866,7 +856,7 @@ SUBROUTINE drhov(irb,eigrb,rhovan,drhovan,rhog,rhor,drhog,drhor)
USE cell_base, ONLY: ainv
USE qgb_mod, ONLY: qgb, dqgb
USE fft_interfaces, ONLY: fwfft, invfft
USE fft_base, ONLY: dfftb, dfftp, dfftb
USE fft_base, ONLY: dfftb, dfftp
USE mp_global, ONLY: my_bgrp_id, nbgrp, inter_bgrp_comm
USE mp, ONLY: mp_sum
......@@ -963,7 +953,7 @@ SUBROUTINE drhov(irb,eigrb,rhovan,drhovan,rhog,rhor,drhog,drhor)
#if defined(__MPI)
DO ia=1,na(is)
nfft=1
IF ( ( dfftb%np3( isa ) <= 0 ) ) THEN
IF ( ( dfftb%np3( isa ) <= 0 ) .OR. ( dfftb%np2( isa ) <= 0 ) ) THEN
isa = isa + nfft
CYCLE
END IF
......@@ -1074,7 +1064,7 @@ SUBROUTINE drhov(irb,eigrb,rhovan,drhovan,rhog,rhor,drhog,drhor)
DO is=1,nvb
DO ia=1,na(is)
#if defined(__MPI)
IF ( dfftb%np3( isa ) <= 0 ) go to 25
IF ( ( dfftb%np3( isa ) <= 0 ) .OR. ( dfftb%np2( isa ) <= 0 ) ) go to 25
#endif
DO iss=1,2
dqgbt(:,iss) = (0.d0, 0.d0)
......@@ -1255,16 +1245,13 @@ SUBROUTINE rhov(irb,eigrb,rhovan,rhog,rhor)
DO is = 1, nvb
#if defined(__MPI)
DO ia = 1, na(is)
nfft = 1
IF ( dfftb%np3( isa ) <= 0 ) THEN
IF ( ( dfftb%np3( isa ) <= 0 ) .OR. ( dfftb%np2( isa ) <= 0 ) ) THEN
isa = isa + nfft
CYCLE
END IF
#else
DO ia = 1, na(is), 2
!
! nfft=2 if two ffts at the same time are performed
......@@ -1301,12 +1288,10 @@ SUBROUTINE rhov(irb,eigrb,rhovan,rhog,rhor)
qv(:) = (0.d0, 0.d0)
IF(nfft.EQ.2)THEN
DO ig=1,ngb
qv(npb(ig))= &
eigrb(ig,isa )*qgbt(ig,1) &
+ ci* eigrb(ig,isa+1)*qgbt(ig,2)
qv(nmb(ig))= &
CONJG(eigrb(ig,isa )*qgbt(ig,1)) &
+ ci*CONJG(eigrb(ig,isa+1)*qgbt(ig,2))
qv(npb(ig))= eigrb(ig,isa )*qgbt(ig,1) &
+ ci * eigrb(ig,isa+1)*qgbt(ig,2)
qv(nmb(ig))= CONJG(eigrb(ig,isa )*qgbt(ig,1)) &
+ ci * CONJG(eigrb(ig,isa+1)*qgbt(ig,2))
END DO
ELSE
DO ig=1,ngb
......@@ -1400,7 +1385,7 @@ SUBROUTINE rhov(irb,eigrb,rhovan,rhog,rhor)
DO is=1,nvb
DO ia=1,na(is)
#if defined(__MPI)
IF ( dfftb%np3( isa ) <= 0 ) go to 25
IF ( ( dfftb%np3( isa ) <= 0 ) .OR. ( dfftb%np2( isa ) <= 0 ) ) go to 25
#endif
DO iss=1,2
qgbt(:,iss) = (0.d0, 0.d0)
......
......@@ -781,7 +781,7 @@ subroutine formf( tfirst, eself )
rhopsum = SUM( rhops( 1:ngms, is ) )
call mp_sum( vpsum, intra_bgrp_comm )
call mp_sum( rhopsum, intra_bgrp_comm )
WRITE( stdout,1250) vps(1,is),rhops(1,is)
WRITE( stdout,1250) (vps(ig,is),rhops(ig,is),ig=1,5)
WRITE( stdout,1300) vpsum,rhopsum
endif
!
......
......@@ -210,6 +210,7 @@ SUBROUTINE cprmain( tau_out, fion_out, etot_out )
tfile = ( MOD( nfi, iprint ) == 0 )
tstdout = ( MOD( nfi, iprint_stdout ) == 0 ) .OR. tlast
!
IF ( abivol ) THEN
IF ( pvar ) THEN
IF ( nfi .EQ. 1 ) THEN
......@@ -369,6 +370,7 @@ SUBROUTINE cprmain( tau_out, fion_out, etot_out )
!
END IF
!
!
!=======================================================================
!
! verlet algorithm
......
......@@ -25,6 +25,7 @@ SUBROUTINE cpr_loop( nloop )
REAL(DP) :: etot
!
!
IF ( nat > 0 ) THEN
!
ALLOCATE( tau( 3, nat ) )
......
......@@ -10,7 +10,7 @@ SUBROUTINE exx_gs(nfi, c)
!
USE kinds, ONLY : DP
USE constants, ONLY : fpi
USE fft_base, ONLY : dffts, dfftp, dtgs
USE fft_base, ONLY : dffts, dfftp
USE mp, ONLY : mp_barrier, mp_sum
USE mp_global, ONLY : nproc_image, me_image, root_image, intra_image_comm, intra_bgrp_comm, me_bgrp
USE parallel_include
......@@ -886,7 +886,7 @@ SUBROUTINE exx_gs(nfi, c)
!
!-----------Zhaofeng's vpsil (local) to exx_potential (global) -----------
!
nogrp = dtgs%nogrp
nogrp = dffts%nproc2
!
ALLOCATE( sdispls(nproc_image), sendcount(nproc_image) ); sdispls=0; sendcount=0
ALLOCATE( rdispls(nproc_image), recvcount(nproc_image) ); rdispls=0; recvcount=0
......@@ -961,10 +961,10 @@ SUBROUTINE exx_gs(nfi, c)
#if defined(__MPI)
!
DO ir=1,nproc_image/nogrp
CALL mp_barrier( dtgs%ogrp_comm )
CALL mp_barrier( dffts%comm2 )
CALL MPI_ALLTOALLV(exx_tmp3(1,ir), sendcount1, sdispls1, MPI_DOUBLE_PRECISION, &
& exx_potential(1,ir),recvcount1, rdispls1, MPI_DOUBLE_PRECISION, &
& dtgs%ogrp_comm, ierr)
& dffts%comm2, ierr)
END DO
#endif
!
......
......@@ -36,7 +36,6 @@ MODULE exx_module
USE electrons_base, ONLY: nupdwn !number of states with up and down spin
USE fft_base, ONLY: dffts !FFT derived data type
USE fft_base, ONLY: dfftp !FFT derived data type
USE fft_base, ONLY: dtgs !FFT task groups
USE funct, ONLY: get_exx_fraction ! function to get exx_fraction value
USE funct, ONLY: stop_exx, start_exx
USE input_parameters, ONLY: ref_alat !alat of reference cell ..
......@@ -167,7 +166,7 @@ CONTAINS
#if defined(__OPENMP)
WRITE(stdout,'(5X,"OpenMP threads/MPI task",3X,I4)') omp_get_max_threads()
#endif
WRITE(stdout,'(5X,"Taskgroups ",3X,I7)') dtgs%nogrp
WRITE(stdout,'(5X,"Taskgroups ",3X,I7)') dffts%nproc2
!
! the fraction of exact exchange is stored here
!
......@@ -269,11 +268,11 @@ CONTAINS
!
END IF
!
IF((nproc_image.LE.nbsp).AND.(dtgs%nogrp.GT.1)) CALL errore('exx_module','EXX calculation error : &
IF((nproc_image.LE.nbsp).AND.(dffts%nproc2.GT.1)) CALL errore('exx_module','EXX calculation error : &
& use taskgroup (-ntg) = 1 when number of MPI tasks is less or equal to the number of electronic states',1)
!
! to fix this issue. see file exx_psi.f90, exx_gs.f90
IF(nproc_image.GT.nbsp.AND.MOD(dffts%nnr,dtgs%nogrp).NE.0) CALL errore('exx_module','EXX calculation error : &
IF(nproc_image.GT.nbsp.AND.MOD(dffts%nnr,dffts%nproc2).NE.0) CALL errore('exx_module','EXX calculation error : &
& (nr1x * nr2x) is not integer multiple of the number of task groups. Change task groups such that &
& (nr1x * nr2x) becomes integer multiple of the number of task groups. Otherwise restrict number of MPI tasks &
& up to the number of electronic states.',1)
......@@ -284,7 +283,7 @@ CONTAINS
& or equal to the electronic bands. Otherwise, change ecutwfc to make (nr1x * nr2x) an even number.',1)
!
! to fix this issue. see file exx_psi.f90, exx_gs.f90
IF((nproc_image.GT.nbsp).AND.MOD(nbsp,2*dtgs%nogrp).NE.0) CALL errore('exx_module','EXX calculation error : &
IF((nproc_image.GT.nbsp).AND.MOD(nbsp,2*dffts%nproc2).NE.0) CALL errore('exx_module','EXX calculation error : &
& number of electronic states is not integer multiple of two times the number of task groups. &
& Either change the number of taskgroups or restrict number of MPI tasks up to the number of electronic states.',1)
!
......@@ -302,7 +301,7 @@ CONTAINS
write(stdout,*) "You may want to use number of MPI tasks = ", CEILING(DBLE(2.0*dfftp%nr3)/DBLE(nbsp))*nbsp,&
& "(combined with -ntg 2)"
!
ELSE IF (NINT(2**(LOG(DBLE(INT(nproc_image / dfftp%nr3))) / LOG(2.0))).GT.dtgs%nogrp) THEN
ELSE IF (NINT(2**(LOG(DBLE(INT(nproc_image / dfftp%nr3))) / LOG(2.0))).GT.dffts%nproc2) THEN
!
write(stdout,*)
write(stdout,*) "**********************************************************************************************"
......@@ -312,7 +311,7 @@ CONTAINS
NINT(2**(LOG(DBLE(INT(nproc_image / dfftp%nr3))) / LOG(2.0)))
END IF
!
IF(dtgs%nogrp.EQ.1) THEN
IF(dffts%nproc2.EQ.1) THEN
!
write(stdout,*)
write(stdout,*) "**********************************************************************************************"
......@@ -329,9 +328,9 @@ CONTAINS
& One needs number of task groups = 2^n where n is a positive integer when number of MPI tasks is greater than &
& the number of electronic states. See above for Possible Solutions',1)
!
ELSE IF (NINT(2**(LOG(DBLE(dtgs%nogrp)) / LOG(2.0))).NE.dtgs%nogrp) THEN
ELSE IF (NINT(2**(LOG(DBLE(dffts%nproc2)) / LOG(2.0))).NE.dffts%nproc2) THEN
!
! NINT(2**(LOG(DBLE(dtgs%nogrp)) / LOG(2.0))) is the largest power of 2 that is smaller or equal to dffts%nogrp
! NINT(2**(LOG(DBLE(dffts%nproc2)) / LOG(2.0))) is the largest power of 2 that is smaller or equal to dffts%nogrp
!
CALL errore('exx_module','EXX calculation error : &
& One needs number of task groups = 2^n where n is a positive integer when number of MPI tasks is greater than &
......@@ -340,7 +339,7 @@ CONTAINS
!
END IF
!
IF((dtgs%nogrp.GT.1).AND.(dfftp%nr3*dtgs%nogrp.GT.nproc_image)) CALL errore('exx_module','EXX calculation error : &
IF((dffts%nproc2.GT.1).AND.(dfftp%nr3*dffts%nproc2.GT.nproc_image)) CALL errore('exx_module','EXX calculation error : &
& (nr3x * number of taskgroups) is greater than the number of MPI tasks. Change the number of MPI tasks or the number &
& of taskgroups or both. To estimate ntg, find the value of nr3x in the output and compute (MPI task/nr3x) and take &
& the integer value.',1)
......@@ -377,21 +376,21 @@ CONTAINS
!
IF (nproc_image .LT. nbsp) THEN
!
ALLOCATE( exx_potential(dffts%nr1*dffts%nr2*dffts%npp(me_bgrp+1),nbsp) )
ALLOCATE( exx_potential(dffts%nr1*dffts%nr2*dffts%my_nr3p,nbsp) )
!
ELSE
!
IF ( dtgs%have_task_groups ) THEN
IF ( dffts%have_task_groups ) THEN
!
ALLOCATE( exx_potential(dffts%nnr,nproc_image/dtgs%nogrp) )
ALLOCATE( exx_potential(dffts%nnr,nproc_image/dffts%nproc2) )
!
IF(MOD(nproc_image,dtgs%nogrp).NE.0) CALL errore &
IF(MOD(nproc_image,dffts%nproc2).NE.0) CALL errore &
& ('exx_module','EXX calculation is not working when &
& number of MPI tasks (nproc_image) is not integer multiple of number of taskgroups',1)
!
ELSE
!
ALLOCATE( exx_potential(dffts%nr1x*dffts%nr2x*dffts%npp(me_bgrp+1),nproc_image) ) !
ALLOCATE( exx_potential(dffts%nr1x*dffts%nr2x*dffts%my_nr3p,nproc_image) ) !
!
END IF
!
......@@ -491,7 +490,7 @@ CONTAINS
!
! ** Note that in this case ..
! this is incorrect: my_nxyz(:) = nr1*nr2*dffts%npp(me_image+1)
my_nxyz(:) = nr1*nr2*dffts%npp
my_nxyz(:) = nr1*nr2*dffts%nr3p
!
!DEBUG
!WRITE(stdout,'("my_nbsp")')
......
......@@ -8,7 +8,7 @@ SUBROUTINE exx_psi(c, psitot2,nnrtot,my_nbsp, my_nxyz, nbsp)
!
USE kinds, ONLY : DP
USE fft_interfaces, ONLY : invfft
USE fft_base, ONLY : dffts, dfftp, dtgs
USE fft_base, ONLY : dffts, dfftp
USE gvecw, ONLY : ngw
USE mp_global, ONLY : nproc_image, me_image,intra_image_comm
USE cell_base, ONLY : omega
......@@ -47,9 +47,9 @@ SUBROUTINE exx_psi(c, psitot2,nnrtot,my_nbsp, my_nxyz, nbsp)
nr1s=dfftp%nr1; nr2s=dfftp%nr2; nr3s=dfftp%nr3
!
IF (nproc_image .GE. nr3s) THEN
sizefft=MAX(nr1s*nr2s,dffts%npp(me)*nr1s*nr2s)
sizefft=MAX(nr1s*nr2s,nr1s*nr2s*dffts%my_nr3p)
ELSE
sizefft=dffts%npp(me)*nr1s*nr2s
sizefft=nr1s*nr2s*dffts%my_nr3p
END IF
!
!write(*,'(10I7)') me, dffts%npp(me), dfftp%npp(me), dffts%tg_npp(me), dfftp%nr1x, dfftp%nr2x, dfftp%nr3x, dffts%nnr, dfftp%nnr, dffts%tg_nnr
......@@ -105,7 +105,7 @@ SUBROUTINE exx_psi(c, psitot2,nnrtot,my_nbsp, my_nxyz, nbsp)
!write(stdout,'("dffts%nnr*nogrp:",I10)'), dffts%nnr*nogrp
!write(stdout,'("nogrp*nr3 should be smaller or equal to nproc_image:")')
!
nogrp = dtgs%nogrp
nogrp = dffts%nproc2
!
ALLOCATE( sdispls(nproc_image), sendcount(nproc_image) ); sdispls=0; sendcount=0
ALLOCATE( rdispls(nproc_image), recvcount(nproc_image) ); rdispls=0; recvcount=0
......@@ -168,14 +168,14 @@ SUBROUTINE exx_psi(c, psitot2,nnrtot,my_nbsp, my_nxyz, nbsp)
!
END DO
!
CALL invfft( 'Wave', psis, dffts, dtgs )
CALL invfft( 'tgWave', psis, dffts )
!
#if defined(__MPI)
!
CALL mp_barrier( dtgs%ogrp_comm )
CALL mp_barrier( dffts%comm2 )
CALL MPI_ALLTOALLV(psis, sendcount1, sdispls1, MPI_DOUBLE_COMPLEX, &
& psis1, recvcount1, rdispls1, MPI_DOUBLE_COMPLEX, &
& dtgs%ogrp_comm, ierr)
& dffts%comm2, ierr)
#endif
!
ngpww1 = 0
......
......@@ -274,24 +274,29 @@
ibig3=1+MOD(ibig3-1,dfftp%nr3)
IF(ibig3.LT.1.OR.ibig3.GT.dfftp%nr3) &
& CALL errore('box2grid','ibig3 wrong',ibig3)
ibig3=ibig3-dfftp%ipp(me)
IF ( ibig3 .GT. 0 .AND. ibig3 .LE. ( dfftp%npp(me) ) ) THEN
ibig3=ibig3-dfftp%my_i0r3p
IF ( ibig3 .GT. 0 .AND. ibig3 .LE. ( dfftp%my_nr3p ) ) THEN
DO ir2=1,dfftb%nr2
ibig2=irb(2)+ir2-1
ibig2=1+MOD(ibig2-1,dfftp%nr2)
IF(ibig2.LT.1.OR.ibig2.GT.dfftp%nr2) &
& CALL errore('box2grid','ibig2 wrong',ibig2)
DO ir1=1,dfftb%nr1
ibig1=irb(1)+ir1-1
ibig1=1+MOD(ibig1-1,dfftp%nr1)
IF(ibig1.LT.1.OR.ibig1.GT.dfftp%nr1) &
& CALL errore('box2grid','ibig1 wrong',ibig1)
ibig=ibig1+(ibig2-1)*dfftp%nr1x+(ibig3-1)*dfftp%nr1x*dfftp%nr2x
ir=ir1+(ir2-1)*dfftb%nr1x+(ir3-1)*dfftb%nr1x*dfftb%nr2x
ibig2=ibig2-dfftp%my_i0r2p
IF ( ibig2 .GT. 0 .AND. ibig2 .LE. ( dfftp%my_nr2p ) ) THEN
DO ir1=1,dfftb%nr1
ibig1=irb(1)+ir1-1
ibig1=1+MOD(ibig1-1,dfftp%nr1)
IF(ibig1.LT.1.OR.ibig1.GT.dfftp%nr1) &
& CALL errore('box2grid','ibig1 wrong',ibig1)
ibig=ibig1+(ibig2-1)*dfftp%nr1x+(ibig3-1)*dfftp%nr1x*dfftp%my_nr2p
ir=ir1+(ir2-1)*dfftb%nr1x+(ir3-1)*dfftb%nr1x*dfftb%nr2x
!$omp critical
vr(ibig) = vr(ibig)+qv(nfft,ir)
vr(ibig) = vr(ibig)+qv(nfft,ir)
!$omp end critical
END DO
END DO
END IF
END DO
END IF
END DO
......@@ -327,22 +332,27 @@
ibig3=1+MOD(ibig3-1,dfftp%nr3)
IF(ibig3.LT.1.OR.ibig3.GT.dfftp%nr3) &
& CALL errore('box2grid2','ibig3 wrong',ibig3)
ibig3=ibig3-dfftp%ipp(me)
IF (ibig3.GT.0.AND.ibig3.LE. dfftp%npp(me) ) THEN
ibig3=ibig3-dfftp%my_i0r3p
IF (ibig3.GT.0.AND.ibig3.LE. dfftp%my_nr3p ) THEN
DO ir2=1,dfftb%nr2
ibig2=irb(2)+ir2-1
ibig2=1+MOD(ibig2-1,dfftp%nr2)
IF(ibig2.LT.1.OR.ibig2.GT.dfftp%nr2) &
& CALL errore('box2grid2','ibig2 wrong',ibig2)
DO ir1=1,dfftb%nr1
ibig1=irb(1)+ir1-1
ibig1=1+MOD(ibig1-1,dfftp%nr1)
IF(ibig1.LT.1.OR.ibig1.GT.dfftp%nr1) &
& CALL errore('box2grid2','ibig1 wrong',ibig1)
ibig=ibig1+(ibig2-1)*dfftp%nr1x+(ibig3-1)*dfftp%nr1x*dfftp%nr2x
ir=ir1+(ir2-1)*dfftb%nr1x+(ir3-1)*dfftb%nr1x*dfftb%nr2x
v(ibig) = v(ibig)+qv(ir)
END DO
ibig2=ibig2-dfftp%my_i0r2p
IF (ibig2.GT.0.AND.ibig2.LE. dfftp%my_nr2p ) THEN
DO ir1=1,dfftb%nr1
ibig1=irb(1)+ir1-1
ibig1=1+MOD(ibig1-1,dfftp%nr1)
IF(ibig1.LT.1.OR.ibig1.GT.dfftp%nr1) &
& CALL errore('box2grid2','ibig1 wrong',ibig1)
ibig=ibig1+(ibig2-1)*dfftp%nr1x+(ibig3-1)*dfftp%nr1x*dfftp%my_nr2p
ir=ir1+(ir2-1)*dfftb%nr1x+(ir3-1)*dfftb%nr1x*dfftb%nr2x
v(ibig) = v(ibig)+qv(ir)
END DO
END IF
END DO
END IF
END DO
......@@ -381,18 +391,21 @@
DO ir3=1,dfftb%nr3
ibig3=irb(3)+ir3-1
ibig3=1+MOD(ibig3-1,dfftp%nr3)
ibig3=ibig3-dfftp%ipp(me)
IF (ibig3.GT.0.AND.ibig3.LE. dfftp%npp(me) ) THEN
ibig3=ibig3-dfftp%my_i0r3p
IF (ibig3.GT.0.AND.ibig3.LE. dfftp%my_nr3p ) THEN
DO ir2=1,dfftb%nr2
ibig2=irb(2)+ir2-1
ibig2=1+MOD(ibig2-1,dfftp%nr2)
DO ir1=1,dfftb%nr1
ibig1=irb(1)+ir1-1
ibig1=1+MOD(ibig1-1,dfftp%nr1)
ibig=ibig1 + (ibig2-1)*dfftp%nr1x + (ibig3-1)*dfftp%nr1x*dfftp%nr2x
ir =ir1 + (ir2-1)*dfftb%nr1x + (ir3-1)*dfftb%nr1x*dfftb%nr2x
boxdotgrid = boxdotgrid + qv(nfft,ir)*vr(ibig)
END DO
ibig2=ibig2-dfftp%my_i0r2p
IF (ibig2.GT.0.AND.ibig2.LE. dfftp%my_nr2p ) THEN
DO ir1=1,dfftb%nr1
ibig1=irb(1)+ir1-1
ibig1=1+MOD(ibig1-1,dfftp%nr1)
ibig=ibig1 + (ibig2-1)*dfftp%nr1x + (ibig3-1)*dfftp%nr1x*dfftp%my_nr2p
ir =ir1 + (ir2-1)*dfftb%nr1x + (ir3-1)*dfftb%nr1x*dfftb%nr2x
boxdotgrid = boxdotgrid + qv(nfft,ir)*vr(ibig)
END DO
ENDIF
END DO
ENDIF
END DO
......@@ -402,36 +415,36 @@
!
!----------------------------------------------------------------------
subroutine parabox(nr3b,irb3,nr3,imin3,imax3)
!----------------------------------------------------------------------
!
! find if box grid planes in the z direction have component on the dense
! grid on this processor, and if, which range imin3-imax3
!
use mp_global, only: me_bgrp
use fft_base, only: dfftp
! input
integer nr3b,irb3,nr3
! output
integer imin3,imax3
! local
integer ir3, ibig3, me
!!----------------------------------------------------------------------
! subroutine parabox(nr3b,irb3,nr3,imin3,imax3)
!!----------------------------------------------------------------------
!!
!! find if box grid planes in the z direction have component on the dense
!! grid on this processor, and if, which range imin3-imax3
!!
! use mp_global, only: me_bgrp
! use fft_base, only: dfftp
!! input
! integer nr3b,irb3,nr3
!! output
! integer imin3,imax3
!! local
! integer ir3, ibig3, me
!!
! me = me_bgrp + 1
! imin3=nr3b
! imax3=1
! do ir3=1,nr3b
! ibig3=1+mod(irb3+ir3-2,nr3)
! if(ibig3.lt.1.or.ibig3.gt.nr3) &
! & call errore('cfftpb','ibig3 wrong',ibig3)
! ibig3=ibig3-dfftp%my_i0r3p
! if (ibig3.gt.0.and.ibig3.le.dfftp%my_nr3p) then