Optimize GPU matvec performance
An initial GPU run of the laplace example (~410k DOF) on an NVIDIA RTX 3070 Ti shows the following performance:
$ ./exset mx 128 my 128 nxbl 4 nybl 4 pd 5
$ ./laplace nstep 3 stat_con 0 dt 0.1 time_wait 1
...
timing statistics in seconds:
function name tag # calls inclusive exclusive
laplace laplace 1 5.90145E+00 6.11000E-04
qp_update_real_2D h1_rect_ 48 6.86330E-02 6.86330E-02
create_matrix_real finite_e 3 7.23275E-01 6.96000E-04
diff_op diff_op 48 6.33046E-01 6.33046E-01
zero_real mat_rect 48 3.53500E-03 3.53500E-03
assemble_real mat_rect 48 8.38690E-02 1.23000E-04
create_vector_real finite_e 3 2.83200E-02 4.65000E-04
diff_rhs diff_rhs 48 1.94070E-02 1.94070E-02
assemble_real vec_rect 48 4.31100E-03 4.31100E-03
edge_load_arr_real vec_rect 2032 6.66450E-02 6.66450E-02
edge_network seam 127 1.12670E-02 1.12670E-02
edge_unload_arr_real vec_rect 2032 5.92840E-02 5.92840E-02
iter_cg_real_dir_solve iter_cg_ 3 4.20254E+00 1.35200E-03
iter_init iter_cg_ 3 1.72206E-01 5.40000E-05
norm_real linalg_u 112 2.18545E-01 3.81000E-04
inf_norm_real vec_rect 1984 2.41780E-01 2.41780E-01
resid_norm_real linalg_u 12 3.07568E-01 1.26000E-04
matvec_real mat_rect 1936 2.62790E+00 4.83800E-03
matvec_kernel mat_rect 30976 2.59662E+00 2.59662E+00
add_vec_real vec_rect 5424 2.67759E-01 2.67759E-01
assign_rvec_real vec_rect 1872 3.01090E-02 3.01090E-02
dot_real linalg_u 218 8.85907E-01 6.72000E-04
dot_real vec_rect 3488 8.85235E-01 8.85235E-01
matvec_real linalg_u 109 2.48511E+00 7.02000E-04
iter_dealloc iter_cg_ 3 5.05100E-03 4.50000E-05
dealloc_real vec_rect 272 5.73700E-03 5.73700E-03
xfer_rvector_to_1fem xfer_vec 48 3.66000E-03 3.66000E-03
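Here is the performance of a larger case (~2.36 million DOF) on a Perlmutter node (4x A100 GPUs). Note that this run sets stat_con 1, whereas the run above used stat_con 0, so the two sets of timings are not directly comparable: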
$ ./exset mx 256 my 256 nxbl 2 nybl 2 pd 6
$ srun -n 4 --gpu-bind=map_gpu:0,1,2,3 ./laplace nstep 3 stat_con 1 dt 0.1 time_wait 1
...
timing statistics in seconds:
function name tag # calls inclusive exclusive
laplace laplace 1 2.84277E+00 2.35562E-03
dump_read dump 1 5.36118E-01 4.99223E-01
h5_read seam 1 5.83375E-04 6.55770E-05
h5_read edge 2 5.17798E-04 5.17798E-04
h5_read rblock 1 3.63123E-02 2.13720E-05
h5_read_global_block_data gblock 1 3.62909E-02 1.93002E-04
h5_read_real_2D h1_rect_ 2 3.60979E-02 1.22196E-02
alloc_real_2D h1_rect_ 3 2.42946E-02 2.42946E-02
block_intg_formula_set rblock 1 1.07600E-05 1.07600E-05
block_metric_set rblock 1 4.65189E-01 3.68364E-02
eval_many_real_2D h1_rect_ 16384 4.28353E-01 1.22478E-01
h1_rect_2D_bases_many h1_rect_ 16385 2.88236E-01 2.88236E-01
get_element_dofs h1_rect_ 16384 1.76755E-02 1.76755E-02
move_field_to_real nodal 1 5.78986E-04 6.91400E-06
alloc_with_mold_real_2D h1_rect_ 1 4.20320E-04 4.03700E-06
assign_field_real_2D h1_rect_ 1 7.87610E-05 7.87610E-05
dealloc_real_2D h1_rect_ 3 1.93793E-04 1.93793E-04
init seam 1 1.39138E-01 9.82485E-02
init edge 1 4.08892E-02 4.08892E-02
alloc_rvector_for_1fem alloc_li 1 6.48150E-05 1.04000E-05
alloc_real vec_rect 20 3.81770E-03 3.81770E-03
alloc_with_mold_real vec_rect 19 3.80884E-03 4.55550E-05
zero_real vec_rect 121 3.17828E-03 3.17828E-03
set_edge_vars_real vec_rect 1 1.01766E-04 1.01766E-04
alloc_rmatrix_for_1fem alloc_li 1 4.92813E-02 7.52500E-06
alloc_real mat_rect 1 4.92738E-02 4.92738E-02
qp_alloc_real_2D h1_rect_ 1 1.51334E-02 1.51334E-02
init_basis_ftn_real_2D h1_rect_ 1 7.93830E-05 4.24110E-05
compute_alpha_real nodal 1 2.87988E-01 1.42088E-01
alpha_eval_real_2D h1_rect_ 1 3.70196E-02 3.70196E-02
alpha_deriv_real_2D h1_rect_ 1 1.08881E-01 1.08881E-01
qp_update_real_2D h1_rect_ 3 2.78354E-02 2.78354E-02
create_matrix_real finite_e 3 7.11247E-01 1.37712E-03
diff_op diff_op 3 2.82738E-01 2.82738E-01
zero_real mat_rect 3 2.38689E-03 2.38689E-03
assemble_real mat_rect 3 1.79687E-01 1.98140E-05
assemble_gg_real mat_rect 3 2.77007E-03 2.77007E-03
assemble_hg_real mat_rect 3 2.28565E-03 2.28565E-03
assemble_vg_real mat_rect 3 2.44717E-03 2.44717E-03
assemble_ig_real mat_rect 3 2.48300E-02 2.48300E-02
assemble_gh_real mat_rect 3 1.68895E-03 1.68895E-03
assemble_hh_real mat_rect 3 1.36404E-03 1.36404E-03
assemble_vh_real mat_rect 3 1.43558E-03 1.43558E-03
assemble_ih_real mat_rect 3 6.61731E-02 6.61731E-02
assemble_gv_real mat_rect 3 2.03216E-03 2.03216E-03
assemble_hv_real mat_rect 3 1.52979E-03 1.52979E-03
assemble_vv_real mat_rect 3 1.40194E-03 1.40194E-03
assemble_iv_real mat_rect 3 6.57051E-02 6.57051E-02
assemble_gi_real mat_rect 3 1.15018E-03 1.15018E-03
assemble_hi_real mat_rect 3 1.38964E-03 1.38964E-03
assemble_vi_real mat_rect 3 1.39528E-03 1.39528E-03
assemble_ii_real mat_rect 3 2.06864E-03 2.06864E-03
find_diag_scale_real mat_rect 3 2.47656E-04 2.47656E-04
elim_inv_int_real mat_rect 3 2.44810E-01 2.44810E-01
create_vector_real finite_e 3 2.21070E-02 7.90850E-05
diff_rhs diff_rhs 3 7.11390E-03 7.11390E-03
assemble_real vec_rect 3 2.52159E-03 2.52159E-03
assign_rvec_real vec_rect 111 2.35216E-03 2.35216E-03
elim_presolve_real mat_rect 3 1.08206E-02 1.16497E-04
matvec_kernel mat_rect 1032 4.09779E-01 4.09779E-01
dealloc_real vec_rect 20 5.85879E-04 5.85879E-04
edge_load_arr_real vec_rect 118 7.21517E-03 7.21517E-03
edge_network seam 118 9.42085E-03 9.42085E-03
edge_unload_arr_real vec_rect 118 5.89777E-03 5.89777E-03
iter_cg_real_dir_solve iter_cg_ 3 4.84240E-01 5.30586E-04
iter_init iter_cg_ 3 2.67704E-02 9.04310E-05
norm_real linalg_u 103 1.54861E-02 1.66915E-03
inf_norm_real vec_rect 115 1.54400E-02 1.54400E-02
resid_norm_real linalg_u 12 4.48441E-02 1.28011E-04
matvec_real mat_rect 112 3.73495E-01 2.62325E-04
add_vec_real vec_rect 312 1.77225E-02 1.77225E-02
dot_real linalg_u 200 4.77635E-02 1.16397E-03
dot_real vec_rect 200 4.65995E-02 4.65995E-02
matvec_real linalg_u 100 3.52406E-01 1.88685E-04
iter_dealloc iter_cg_ 3 4.47393E-04 1.77250E-05
elim_postsolve_real mat_rect 3 2.87444E-02 9.54950E-05
xfer_rvector_to_1fem xfer_vec 3 3.94883E-04 3.94883E-04
dump_write dump 1 3.63171E-02 1.67070E-02
h5_dump seam 1 3.07854E-04 2.80950E-05
h5_dump edge 2 2.79759E-04 2.79759E-04
h5_write rblock 1 1.93023E-02 4.73910E-05
h5_write_global_block_data gblock 1 1.92549E-02 3.02190E-05
h5_dump_real_2D h1_rect_ 2 1.92247E-02 1.92247E-02
dealloc seam 1 3.40750E-02 1.44590E-05
dealloc edge 2 3.40606E-02 3.40606E-02
dealloc_real mat_rect 1 7.59113E-04 7.59113E-04
dealloc rblock 1 5.78920E-05 1.76400E-06
dealloc_global_block_data gblock 1 5.61280E-05 2.00400E-06
rhs_norm expected exact
7.390835426E-06 7.390835426E-06 7.390835426E-06
4.403336722E-06 -4.403336721E-06 2.751957660E-09
2.623434749E-06 2.623434722E-06 1.024684021E-12
normalized error = 1.047E-08
ok - calculation agrees with expectation
Edited by Jacob King