Fix OpenACC

2024-12-22 20:36:01 +01:00 · 2022-04-06 10:42:00 +02:00 · 2022-04-06 10:42:00 +02:00 · 0966e1e2b1
commit 0966e1e2b1
parent 72fad819bf
1 changed files with 540 additions and 520 deletions
--- a/org/qmckl_jastrow.org
+++ b/org/qmckl_jastrow.org
@ -5689,20 +5689,29 @@ qmckl_exit_code qmckl_compute_tmp_c_acc_offload (

  // Compute array access strides:
  // For tmp_c...
-  int stride_k_c  = elec_num;
-  int stride_j_c  = stride_k_c * nucl_num;
-  int stride_i_c  = stride_j_c * (cord_num+1);
-  int stride_nw_c = stride_i_c * cord_num;
+  const int64_t stride_k_c  = elec_num;
+  const int64_t stride_j_c  = stride_k_c * nucl_num;
+  const int64_t stride_i_c  = stride_j_c * (cord_num+1);
+  const int64_t stride_nw_c = stride_i_c * cord_num;
  // For een_rescaled_e...
-  int stride_m_e  = elec_num;
-  int stride_i_e  = stride_m_e * elec_num;
-  int stride_nw_e = stride_i_e * (cord_num+1);
+  const int64_t stride_m_e  = elec_num;
+  const int64_t stride_i_e  = stride_m_e * elec_num;
+  const int64_t stride_nw_e = stride_i_e * (cord_num+1);
  // For een_rescaled_n...
-  int stride_k_n  = elec_num;
-  int stride_j_n  = stride_k_n * nucl_num;
-  int stride_nw_n = stride_j_n * (cord_num+1);
+  const int64_t stride_k_n  = elec_num;
+  const int64_t stride_j_n  = stride_k_n * nucl_num;
+  const int64_t stride_nw_n = stride_j_n * (cord_num+1);
+
+  const int64_t size_tmp_c = elec_num*nucl_num*(cord_num+1)*cord_num*walk_num;
+  const int64_t size_e = walk_num*(cord_num+1)*elec_num*elec_num;
+  const int64_t size_n = walk_num*(cord_num+1)*nucl_num*elec_num; 
+
+  #pragma acc parallel create(tmp_c[0:size_tmp_c]) copyout(tmp_c [0:size_tmp_c]) copyin(een_rescaled_e[0:size_e], een_rescaled_n[0:size_n])
+  {
+      #pragma acc loop independent gang worker vector
+      for (int64_t i=0 ; i<size_tmp_c ; ++i)
+          tmp_c[i] = 0.;

-  #pragma acc parallel
      #pragma acc loop independent gang worker vector collapse(5)
      for (int nw=0; nw < walk_num; ++nw) {
        for (int i=0; i<cord_num; ++i){
@ -5726,6 +5735,7 @@ qmckl_exit_code qmckl_compute_tmp_c_acc_offload (
          }
        }
      }
+  }

  return QMCKL_SUCCESS;
 }
@ -6107,7 +6117,7 @@ qmckl_exit_code qmckl_compute_dtmp_c_hpc (

    #+NAME: qmckl_factor_dtmp_c_acc_offload_args
    | Variable                 | Type                                                                | In/Out | Description                                   |
-    |--------------------------+------------------------------------------------------------------+--------+-----------------------------------------------|
+    |--------------------------+---------------------------------------------------------------------+--------+-----------------------------------------------|
    | ~context~                | ~qmckl_context~                                                     | in     | Global state                                  |
    | ~cord_num~               | ~int64_t~                                                           | in     | Order of polynomials                          |
    | ~elec_num~               | ~int64_t~                                                           | in     | Number of electrons                           |
@ -6115,7 +6125,7 @@ qmckl_exit_code qmckl_compute_dtmp_c_hpc (
    | ~walk_num~               | ~int64_t~                                                           | in     | Number of walkers                             |
    | ~een_rescaled_e_deriv_e~ | ~double[walk_num][0:cord_num][elec_num][4][elec_num]~               | in     | Electron-electron rescaled factor derivatives |
    | ~een_rescaled_n~         | ~double[walk_num][0:cord_num][nucl_num][elec_num]~                  | in     | Electron-nucleus rescaled factor              |
-    | ~dtmp_c~                 | ~double[walk_num][0:cord_num-1][0:cord_num][nucl_num][elec_num]~ | out    | vector of non-zero coefficients               |
+    | ~dtmp_c~                 | ~double[walk_num][0:cord_num-1][0:cord_num][nucl_num][4][elec_num]~ | out    | vector of non-zero coefficients               |


    #+begin_src c :comments org :tangle (eval c) :noweb yes
@ -6148,23 +6158,32 @@ qmckl_exit_code qmckl_compute_dtmp_c_acc_offload (

  // Compute strides...
  // For dtmp_c
-  int stride_l_d  = elec_num;
-  int stride_k_d  = stride_l_d * 4;
-  int stride_j_d  = stride_k_d * nucl_num;
-  int stride_i_d  = stride_j_d * (cord_num+1);
-  int stride_nw_d = stride_i_d * cord_num;
+  const int64_t stride_l_d  = elec_num;
+  const int64_t stride_k_d  = stride_l_d * 4;
+  const int64_t stride_j_d  = stride_k_d * nucl_num;
+  const int64_t stride_i_d  = stride_j_d * (cord_num+1);
+  const int64_t stride_nw_d = stride_i_d * cord_num;
  // For een_rescaled_e_deriv_e
-  int stride_l_e  = elec_num;
-  int stride_n_e  = stride_l_e * 4;
-  int stride_i_e  = stride_n_e * elec_num;
-  int stride_nw_e = stride_i_e * cord_num;
+  const int64_t stride_l_e  = elec_num;
+  const int64_t stride_n_e  = stride_l_e * 4;
+  const int64_t stride_i_e  = stride_n_e * elec_num;
+  const int64_t stride_nw_e = stride_i_e * cord_num;
  // For een_rescaled_n
-  int stride_k_n  = elec_num;
-  int stride_j_n  = stride_k_n * nucl_num;
-  int stride_nw_n = stride_j_n * (cord_num+1);
+  const int64_t stride_k_n  = elec_num;
+  const int64_t stride_j_n  = stride_k_n * nucl_num;
+  const int64_t stride_nw_n = stride_j_n * (cord_num+1);


-  #pragma acc parallel
+  const int64_t size_dtmp_c = walk_num*cord_num*(cord_num+1)*nucl_num*4*elec_num;
+  const int64_t size_n = walk_num*(cord_num+1)*nucl_num*elec_num; 
+  const int64_t size_e = walk_num*(cord_num+1)*elec_num*4*elec_num;
+
+  #pragma acc parallel create(dtmp_c[0:size_dtmp_c]) copyout(dtmp_c [0:size_dtmp_c]) copyin(een_rescaled_e_deriv_e[0:size_e], een_rescaled_n[0:size_n])
+  {
+      #pragma acc loop independent gang worker vector
+      for (int64_t i=0 ; i<size_dtmp_c ; ++i)
+          dtmp_c[i] = 0.;
+
      #pragma loop independent gang worker vector collapse(6)
      for (int nw=0; nw < walk_num; nw++) {
        for (int i=0; i < cord_num; i++) {
@ -6189,6 +6208,7 @@ qmckl_exit_code qmckl_compute_dtmp_c_acc_offload (
          }
        }
      }
+  }

  return QMCKL_SUCCESS;
 }