Optimize GPU matvec performance

An initial GPU run of the laplace example (~410k dof) on an RTX 3070 Ti shows the following performance. Note that matvec_kernel is the largest exclusive cost, at about 2.60 s of the 5.90 s total:

$ ./exset mx 128 my 128 nxbl 4 nybl 4 pd 5
$ ./laplace nstep 3 stat_con 0 dt 0.1 time_wait 1
...
timing statistics in seconds:
function name                   tag     # calls  inclusive   exclusive
laplace                         laplace        1 5.90145E+00 6.11000E-04
qp_update_real_2D               h1_rect_      48 6.86330E-02 6.86330E-02
create_matrix_real              finite_e       3 7.23275E-01 6.96000E-04
diff_op                         diff_op       48 6.33046E-01 6.33046E-01
zero_real                       mat_rect      48 3.53500E-03 3.53500E-03
assemble_real                   mat_rect      48 8.38690E-02 1.23000E-04
create_vector_real              finite_e       3 2.83200E-02 4.65000E-04
diff_rhs                        diff_rhs      48 1.94070E-02 1.94070E-02
assemble_real                   vec_rect      48 4.31100E-03 4.31100E-03
edge_load_arr_real              vec_rect    2032 6.66450E-02 6.66450E-02
edge_network                    seam         127 1.12670E-02 1.12670E-02
edge_unload_arr_real            vec_rect    2032 5.92840E-02 5.92840E-02
iter_cg_real_dir_solve          iter_cg_       3 4.20254E+00 1.35200E-03
iter_init                       iter_cg_       3 1.72206E-01 5.40000E-05
norm_real                       linalg_u     112 2.18545E-01 3.81000E-04
inf_norm_real                   vec_rect    1984 2.41780E-01 2.41780E-01
resid_norm_real                 linalg_u      12 3.07568E-01 1.26000E-04
matvec_real                     mat_rect    1936 2.62790E+00 4.83800E-03
matvec_kernel                   mat_rect   30976 2.59662E+00 2.59662E+00
add_vec_real                    vec_rect    5424 2.67759E-01 2.67759E-01
assign_rvec_real                vec_rect    1872 3.01090E-02 3.01090E-02
dot_real                        linalg_u     218 8.85907E-01 6.72000E-04
dot_real                        vec_rect    3488 8.85235E-01 8.85235E-01
matvec_real                     linalg_u     109 2.48511E+00 7.02000E-04
iter_dealloc                    iter_cg_       3 5.05100E-03 4.50000E-05
dealloc_real                    vec_rect     272 5.73700E-03 5.73700E-03
xfer_rvector_to_1fem            xfer_vec      48 3.66000E-03 3.66000E-03

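Here's the performance of a larger case (~2.36 million dof) on a Perlmutter node (4x A100 GPUs). Note that this run uses stat_con 1, whereas the run above used stat_con 0, so the two tables are not a like-for-like comparison: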

$ ./exset mx 256 my 256 nxbl 2 nybl 2 pd 6
$ srun -n 4 --gpu-bind=map_gpu:0,1,2,3 ./laplace nstep 3 stat_con 1 dt 0.1 time_wait 1
...
timing statistics in seconds:
function name                   tag     # calls  inclusive   exclusive
laplace                         laplace        1 2.84277E+00 2.35562E-03
dump_read                       dump           1 5.36118E-01 4.99223E-01
h5_read                         seam           1 5.83375E-04 6.55770E-05
h5_read                         edge           2 5.17798E-04 5.17798E-04
h5_read                         rblock         1 3.63123E-02 2.13720E-05
h5_read_global_block_data       gblock         1 3.62909E-02 1.93002E-04
h5_read_real_2D                 h1_rect_       2 3.60979E-02 1.22196E-02
alloc_real_2D                   h1_rect_       3 2.42946E-02 2.42946E-02
block_intg_formula_set          rblock         1 1.07600E-05 1.07600E-05
block_metric_set                rblock         1 4.65189E-01 3.68364E-02
eval_many_real_2D               h1_rect_   16384 4.28353E-01 1.22478E-01
h1_rect_2D_bases_many           h1_rect_   16385 2.88236E-01 2.88236E-01
get_element_dofs                h1_rect_   16384 1.76755E-02 1.76755E-02
move_field_to_real              nodal          1 5.78986E-04 6.91400E-06
alloc_with_mold_real_2D         h1_rect_       1 4.20320E-04 4.03700E-06
assign_field_real_2D            h1_rect_       1 7.87610E-05 7.87610E-05
dealloc_real_2D                 h1_rect_       3 1.93793E-04 1.93793E-04
init                            seam           1 1.39138E-01 9.82485E-02
init                            edge           1 4.08892E-02 4.08892E-02
alloc_rvector_for_1fem          alloc_li       1 6.48150E-05 1.04000E-05
alloc_real                      vec_rect      20 3.81770E-03 3.81770E-03
alloc_with_mold_real            vec_rect      19 3.80884E-03 4.55550E-05
zero_real                       vec_rect     121 3.17828E-03 3.17828E-03
set_edge_vars_real              vec_rect       1 1.01766E-04 1.01766E-04
alloc_rmatrix_for_1fem          alloc_li       1 4.92813E-02 7.52500E-06
alloc_real                      mat_rect       1 4.92738E-02 4.92738E-02
qp_alloc_real_2D                h1_rect_       1 1.51334E-02 1.51334E-02
init_basis_ftn_real_2D          h1_rect_       1 7.93830E-05 4.24110E-05
compute_alpha_real              nodal          1 2.87988E-01 1.42088E-01
alpha_eval_real_2D              h1_rect_       1 3.70196E-02 3.70196E-02
alpha_deriv_real_2D             h1_rect_       1 1.08881E-01 1.08881E-01
qp_update_real_2D               h1_rect_       3 2.78354E-02 2.78354E-02
create_matrix_real              finite_e       3 7.11247E-01 1.37712E-03
diff_op                         diff_op        3 2.82738E-01 2.82738E-01
zero_real                       mat_rect       3 2.38689E-03 2.38689E-03
assemble_real                   mat_rect       3 1.79687E-01 1.98140E-05
assemble_gg_real                mat_rect       3 2.77007E-03 2.77007E-03
assemble_hg_real                mat_rect       3 2.28565E-03 2.28565E-03
assemble_vg_real                mat_rect       3 2.44717E-03 2.44717E-03
assemble_ig_real                mat_rect       3 2.48300E-02 2.48300E-02
assemble_gh_real                mat_rect       3 1.68895E-03 1.68895E-03
assemble_hh_real                mat_rect       3 1.36404E-03 1.36404E-03
assemble_vh_real                mat_rect       3 1.43558E-03 1.43558E-03
assemble_ih_real                mat_rect       3 6.61731E-02 6.61731E-02
assemble_gv_real                mat_rect       3 2.03216E-03 2.03216E-03
assemble_hv_real                mat_rect       3 1.52979E-03 1.52979E-03
assemble_vv_real                mat_rect       3 1.40194E-03 1.40194E-03
assemble_iv_real                mat_rect       3 6.57051E-02 6.57051E-02
assemble_gi_real                mat_rect       3 1.15018E-03 1.15018E-03
assemble_hi_real                mat_rect       3 1.38964E-03 1.38964E-03
assemble_vi_real                mat_rect       3 1.39528E-03 1.39528E-03
assemble_ii_real                mat_rect       3 2.06864E-03 2.06864E-03
find_diag_scale_real            mat_rect       3 2.47656E-04 2.47656E-04
elim_inv_int_real               mat_rect       3 2.44810E-01 2.44810E-01
create_vector_real              finite_e       3 2.21070E-02 7.90850E-05
diff_rhs                        diff_rhs       3 7.11390E-03 7.11390E-03
assemble_real                   vec_rect       3 2.52159E-03 2.52159E-03
assign_rvec_real                vec_rect     111 2.35216E-03 2.35216E-03
elim_presolve_real              mat_rect       3 1.08206E-02 1.16497E-04
matvec_kernel                   mat_rect    1032 4.09779E-01 4.09779E-01
dealloc_real                    vec_rect      20 5.85879E-04 5.85879E-04
edge_load_arr_real              vec_rect     118 7.21517E-03 7.21517E-03
edge_network                    seam         118 9.42085E-03 9.42085E-03
edge_unload_arr_real            vec_rect     118 5.89777E-03 5.89777E-03
iter_cg_real_dir_solve          iter_cg_       3 4.84240E-01 5.30586E-04
iter_init                       iter_cg_       3 2.67704E-02 9.04310E-05
norm_real                       linalg_u     103 1.54861E-02 1.66915E-03
inf_norm_real                   vec_rect     115 1.54400E-02 1.54400E-02
resid_norm_real                 linalg_u      12 4.48441E-02 1.28011E-04
matvec_real                     mat_rect     112 3.73495E-01 2.62325E-04
add_vec_real                    vec_rect     312 1.77225E-02 1.77225E-02
dot_real                        linalg_u     200 4.77635E-02 1.16397E-03
dot_real                        vec_rect     200 4.65995E-02 4.65995E-02
matvec_real                     linalg_u     100 3.52406E-01 1.88685E-04
iter_dealloc                    iter_cg_       3 4.47393E-04 1.77250E-05
elim_postsolve_real             mat_rect       3 2.87444E-02 9.54950E-05
xfer_rvector_to_1fem            xfer_vec       3 3.94883E-04 3.94883E-04
dump_write                      dump           1 3.63171E-02 1.67070E-02
h5_dump                         seam           1 3.07854E-04 2.80950E-05
h5_dump                         edge           2 2.79759E-04 2.79759E-04
h5_write                        rblock         1 1.93023E-02 4.73910E-05
h5_write_global_block_data      gblock         1 1.92549E-02 3.02190E-05
h5_dump_real_2D                 h1_rect_       2 1.92247E-02 1.92247E-02
dealloc                         seam           1 3.40750E-02 1.44590E-05
dealloc                         edge           2 3.40606E-02 3.40606E-02
dealloc_real                    mat_rect       1 7.59113E-04 7.59113E-04
dealloc                         rblock         1 5.78920E-05 1.76400E-06
dealloc_global_block_data       gblock         1 5.61280E-05 2.00400E-06
  rhs_norm          expected          exact
   7.390835426E-06   7.390835426E-06   7.390835426E-06
   4.403336722E-06  -4.403336721E-06   2.751957660E-09
   2.623434749E-06   2.623434722E-06   1.024684021E-12
normalized error =  1.047E-08
ok - calculation agrees with expectation
Edited by Jacob King