diff --git a/configure.ac b/configure.ac index 968fbf8..56f0ed1 100644 --- a/configure.ac +++ b/configure.ac @@ -49,12 +49,12 @@ AS_IF([test -d ${srcdir}/.git], AC_ARG_WITH(ifort, [AS_HELP_STRING([--with-ifort],[Use Intel Fortran compiler])], with_ifort=$withval, with_ifort=no) AS_IF([test "$with_ifort" == "yes"], [ FC=ifort - FCFLAGS="-xHost -ip -Ofast -ftz -finline -g -mkl=sequential" ]) + FCFLAGS="-march=native -ip -Ofast -ftz -finline -g -mkl=sequential" ]) AC_ARG_WITH(icc, [AS_HELP_STRING([--with-icc],[Use Intel C compiler])], with_icc=$withval, with_icc=no) AS_IF([test "$with_icc" == "yes"], [ CC=icc - CFLAGS="-xHost -ip -Ofast -ftz -finline -g -mkl=sequential" ]) + CFLAGS="-march=native -ip -Ofast -ftz -finline -g -mkl=sequential" ]) AS_IF([test "$with_icc"."$with_ifort" == "yes.yes"], [ ax_blas_ok="yes" diff --git a/org/examples.org b/org/examples.org new file mode 100644 index 0000000..58dc366 --- /dev/null +++ b/org/examples.org @@ -0,0 +1,199 @@ +#+TITLE: Code examples +#+SETUPFILE: ../tools/theme.setup +#+INCLUDE: ../tools/lib.org + +In this section, we present examples of usage of QMCkl. +For simplicity, we assume that the wave function parameters are stores +in a [[https://github.com/TREX-CoE/trexio][TREXIO]] file. + +* Checking errors + + All QMCkl functions return an error code. A convenient way to handle + errors is to write an error-checking function that displays the + error in text format and exits the program. + + #+NAME: qmckl_check_error + #+begin_src f90 +subroutine qmckl_check_error(rc, message) + use qmckl + implicit none + integer(qmckl_exit_code), intent(in) :: rc + character(len=*) , intent(in) :: message + character(len=128) :: str_buffer + if (rc /= QMCKL_SUCCESS) then + print *, message + call qmckl_string_of_error(rc, str_buffer) + print *, str_buffer + call exit(rc) + end if +end subroutine qmckl_check_error + #+end_src + +* Computing an atomic orbital on a grid + :PROPERTIES: + :header-args: :tangle ao_grid.f90 + :END: + + The following program, in Fortran, computes the values of an atomic + orbital on a regular 3-dimensional grid. The 100^3 grid points are + automatically defined, such that the molecule fits in a box with 5 + atomic units in the borders. + + This program uses the ~qmckl_check_error~ function defined above. + + To use this program, run + + #+begin_src bash :tangle no +$ ao_grid + #+end_src + + + #+begin_src f90 :noweb yes +<> + +program ao_grid + use qmckl + implicit none + + integer(qmckl_context) :: qmckl_ctx ! QMCkl context + integer(qmckl_exit_code) :: rc ! Exit code of QMCkl functions + + character(len=128) :: trexio_filename + character(len=128) :: str_buffer + integer :: ao_id + integer :: point_num_x + + integer(c_int64_t) :: nucl_num + double precision, allocatable :: nucl_coord(:,:) + + integer(c_int64_t) :: point_num + integer(c_int64_t) :: ao_num + integer(c_int64_t) :: ipoint, i, j, k + double precision :: x, y, z, dr(3) + double precision :: rmin(3), rmax(3) + double precision, allocatable :: points(:,:) + double precision, allocatable :: ao_vgl(:,:,:) + #+end_src + + Start by fetching the command-line arguments: + + #+begin_src f90 + if (iargc() /= 3) then + print *, 'Syntax: ao_grid ' + call exit(-1) + end if + call getarg(1, trexio_filename) + call getarg(2, str_buffer) + read(str_buffer, *) ao_id + call getarg(3, str_buffer) + read(str_buffer, *) point_num_x + + if (point_num_x < 0 .or. point_num_x > 300) then + print *, 'Error: 0 < point_num < 300' + call exit(-1) + end if + #+end_src + + Create the QMCkl context and initialize it with the wave function + present in the TREXIO file: + + #+begin_src f90 + qmckl_ctx = qmckl_context_create() + rc = qmckl_trexio_read(qmckl_ctx, trexio_filename, 1_8*len(trim(trexio_filename))) + call qmckl_check_error(rc, 'Read TREXIO') + #+end_src + + We need to check that ~ao_id~ is in the range, so we get the total + number of AOs from QMCkl: + + #+begin_src f90 + rc = qmckl_get_ao_basis_ao_num(qmckl_ctx, ao_num) + call qmckl_check_error(rc, 'Getting ao_num') + + if (ao_id < 0 .or. ao_id > ao_num) then + print *, 'Error: 0 < ao_id < ', ao_num + call exit(-1) + end if + #+end_src + + Now we will compute the limits of the box in which the molecule fits. + For that, we first need to ask QMCkl the coordinates of nuclei. + + #+begin_src f90 + rc = qmckl_get_nucleus_num(qmckl_ctx, nucl_num) + call qmckl_check_error(rc, 'Get nucleus num') + + allocate( nucl_coord(3, nucl_num) ) + rc = qmckl_get_nucleus_coord(qmckl_ctx, 'N', nucl_coord, 3_8*nucl_num) + call qmckl_check_error(rc, 'Get nucleus coord') + #+end_src + + We now compute the coordinates of opposite points of the box, and + the distance between points along the 3 directions: + + #+begin_src f90 + rmin(1) = minval( nucl_coord(1,:) ) - 5.d0 + rmin(2) = minval( nucl_coord(2,:) ) - 5.d0 + rmin(3) = minval( nucl_coord(3,:) ) - 5.d0 + + rmax(1) = maxval( nucl_coord(1,:) ) + 5.d0 + rmax(2) = maxval( nucl_coord(2,:) ) + 5.d0 + rmax(3) = maxval( nucl_coord(3,:) ) + 5.d0 + + dr(1:3) = (rmax(1:3) - rmin(1:3)) / dble(point_num_x-1) + #+end_src + + We now produce the list of point coordinates where the AO will be + evaluated: + + #+begin_src f90 + point_num = point_num_x**3 + allocate( points(point_num, 3) ) + ipoint=0 + z = rmin(3) + do k=1,point_num_x + y = rmin(2) + do j=1,point_num_x + x = rmin(1) + do i=1,point_num_x + ipoint = ipoint+1 + points(ipoint,1) = x + points(ipoint,2) = y + points(ipoint,3) = z + x = x + dr(1) + end do + y = y + dr(2) + end do + z = z + dr(3) + end do + #+end_src + + We give the points to QMCkl: + + #+begin_src f90 + rc = qmckl_set_point(qmckl_ctx, 'T', points, point_num) + call qmckl_check_error(rc, 'Setting points') + #+end_src + + We allocate the space required to retrieve the values, gradients and + Laplacian of all AOs, and ask to retrieve the values of the + AOs computed at the point positions. + + #+begin_src f90 + allocate( ao_vgl(ao_num, 5, point_num) ) + rc = qmckl_get_ao_basis_ao_vgl(qmckl_ctx, ao_vgl, ao_num*5_8*point_num) + call qmckl_check_error(rc, 'Setting points') + #+end_src + + We finally print the value of the AO: + + #+begin_src f90 + do ipoint=1, point_num + print '(3(F16.10,X),E20.10)', points(ipoint, 1:3), ao_vgl(ao_id,1,ipoint) + end do + #+end_src + + #+begin_src f90 + deallocate( nucl_coord, points, ao_vgl ) +end program ao_grid + #+end_src diff --git a/org/qmckl_ao.org b/org/qmckl_ao.org index 7cc7eef..54c5319 100644 --- a/org/qmckl_ao.org +++ b/org/qmckl_ao.org @@ -1,4 +1,3 @@ -B #+TITLE: Atomic Orbitals #+SETUPFILE: ../tools/theme.setup #+INCLUDE: ../tools/lib.org @@ -49,6 +48,7 @@ gradients and Laplacian of the atomic basis functions. #+begin_src c :tangle (eval h_private_func) #ifndef QMCKL_AO_HPF #define QMCKL_AO_HPF +#include "qmckl_blas_private_type.h" #+end_src @@ -57,6 +57,8 @@ gradients and Laplacian of the atomic basis functions. #define QMCKL_AO_HPT #include +#include +#include "qmckl_blas_private_type.h" #+end_src #+begin_src f90 :tangle (eval f) :noweb yes @@ -105,6 +107,7 @@ int main() { #include "qmckl.h" #include "qmckl_context_private_type.h" #include "qmckl_memory_private_type.h" +#include "qmckl_blas_private_type.h" #include "qmckl_memory_private_func.h" #include "qmckl_ao_private_type.h" #include "qmckl_ao_private_func.h" @@ -136,6 +139,7 @@ int main() { | ~ao_num~ | ~int64_t~ | Number of AOs | | ~ao_cartesian~ | ~bool~ | If true, use polynomials. Otherwise, use spherical harmonics | | ~ao_factor~ | ~double[ao_num]~ | Normalization factor of the AO | + |---------------------+----------------------+----------------------------------------------------------------------| For H_2 with the following basis set, @@ -271,35 +275,35 @@ end interface #+begin_src c :comments org :tangle (eval h_private_type) :noweb yes typedef struct qmckl_ao_basis_struct { - int64_t shell_num; - int64_t prim_num; - int64_t ao_num; - int64_t * nucleus_index; - int64_t * nucleus_shell_num; - int32_t * shell_ang_mom; - int64_t * shell_prim_num; - int64_t * shell_prim_index; - double * shell_factor; - double * exponent; - double * coefficient; - double * prim_factor; - double * ao_factor; + int64_t shell_num; + int64_t prim_num; + int64_t ao_num; + int64_t * restrict nucleus_index; + int64_t * restrict nucleus_shell_num; + int32_t * restrict shell_ang_mom; + int64_t * restrict shell_prim_num; + int64_t * restrict shell_prim_index; + double * restrict shell_factor; + double * restrict exponent; + double * restrict coefficient; + double * restrict prim_factor; + double * restrict ao_factor; - int64_t * nucleus_prim_index; - double * coefficient_normalized; - int32_t * nucleus_max_ang_mom; - double * nucleus_range; - double * primitive_vgl; - uint64_t primitive_vgl_date; - double * shell_vgl; - uint64_t shell_vgl_date; - double * ao_vgl; - uint64_t ao_vgl_date; + int64_t * restrict nucleus_prim_index; + double * restrict coefficient_normalized; + int32_t * restrict nucleus_max_ang_mom; + double * restrict nucleus_range; + double * restrict primitive_vgl; + uint64_t primitive_vgl_date; + double * restrict shell_vgl; + uint64_t shell_vgl_date; + double * restrict ao_vgl; + uint64_t ao_vgl_date; - int32_t uninitialized; - bool provided; - bool ao_cartesian; - char type; + int32_t uninitialized; + bool provided; + bool ao_cartesian; + char type; #ifdef HAVE_HPC <> #endif @@ -774,14 +778,14 @@ qmckl_set_ao_basis_shell_prim_num (qmckl_context context, } #+end_src - + #+begin_src c :comments org :tangle (eval h_func) qmckl_exit_code qmckl_set_ao_basis_shell_prim_index (qmckl_context context, const int64_t* shell_prim_index, const int64_t size_max); #+end_src - + #+begin_src c :comments org :tangle (eval c) :noweb yes :exports none qmckl_exit_code qmckl_set_ao_basis_shell_prim_index (qmckl_context context, @@ -835,15 +839,15 @@ qmckl_set_ao_basis_shell_prim_index (qmckl_context context, <> } #+end_src - - + + #+begin_src c :comments org :tangle (eval h_func) qmckl_exit_code qmckl_set_ao_basis_shell_factor (qmckl_context context, const double* shell_factor, const int64_t size_max); #+end_src - + #+begin_src c :comments org :tangle (eval c) :noweb yes :exports none qmckl_exit_code qmckl_set_ao_basis_shell_factor (qmckl_context context, @@ -897,15 +901,15 @@ qmckl_set_ao_basis_shell_factor (qmckl_context context, <> } #+end_src - - + + #+begin_src c :comments org :tangle (eval h_func) qmckl_exit_code qmckl_set_ao_basis_exponent (qmckl_context context, const double* exponent, const int64_t size_max); #+end_src - + #+begin_src c :comments org :tangle (eval c) :noweb yes :exports none qmckl_exit_code qmckl_set_ao_basis_exponent (qmckl_context context, @@ -2488,6 +2492,7 @@ free(ao_factor_test); | ~shell_vgl_date~ | ~uint64_t~ | Last modification date of Value, gradients, Laplacian of the AOs at current positions | | ~ao_vgl~ | ~double[point_num][5][ao_num]~ | Value, gradients, Laplacian of the primitives at current positions | | ~ao_vgl_date~ | ~uint64_t~ | Last modification date of Value, gradients, Laplacian of the AOs at current positions | + |----------------------+-----------------------------------+----------------------------------------------------------------------------------------------| *** After initialization @@ -2638,14 +2643,15 @@ qmckl_exit_code qmckl_finalize_basis(qmckl_context context) { } #+end_src -*** HPC-specific data structures +*** TODO HPC-specific data structures + For faster access, we provide extra arrays for the shell information as: - - $C_{psa}$ = =coef_per_nucleus (iprim, ishell, inucl)= - - $\gamma_{pa}$ =expo_per_nucleus (iprim, inucl)= + - $C_{psa}$ = =coef_per_nucleus[inucl][ishell][iprim]= + - $\gamma_{pa}$ =expo_per_nucleus[inucl][iprim]= such that the computation of the radial parts can be vectorized. - + Exponents are sorted in increasing order to exit loops quickly, and only unique exponents are kept. This also allows to pack the exponents to enable vectorization of exponentials. @@ -2655,12 +2661,13 @@ qmckl_exit_code qmckl_finalize_basis(qmckl_context context) { R_{sa} = \sum_p C_{psa} \gamma_{pa} \] which is a matrix-vector product. - - #+NAME:HPC_struct + + #+NAME:HPC_struct #+begin_src c :comments org :exports none /* HPC specific data structures */ - qmckl_tensor coef_per_nucleus; - qmckl_matrix expo_per_nucleus; + int32_t* restrict prim_num_per_nucleus; + qmckl_tensor coef_per_nucleus; + qmckl_matrix expo_per_nucleus; #+end_src #+begin_src c :comments org :tangle (eval h_private_func) :exports none @@ -2668,19 +2675,40 @@ qmckl_exit_code qmckl_finalize_basis(qmckl_context context) { qmckl_exit_code qmckl_finalize_basis_hpc (qmckl_context context); #endif #+end_src - + #+begin_src c :comments org :tangle (eval c) :exports none +/* Data structure for sorting */ +struct combined { + double expo; + int64_t index; +}; + +/* Comparison function */ +int compare_basis( const void * l, const void * r ) +{ + const struct combined * restrict _l= (const struct combined *)l; + const struct combined * restrict _r= (const struct combined *)r; + if( _l->expo > _r->expo ) return 1; + if( _l->expo < _r->expo ) return -1; + return 0; +} + #ifdef HAVE_HPC qmckl_exit_code qmckl_finalize_basis_hpc (qmckl_context context) { qmckl_context_struct* const ctx = (qmckl_context_struct* const) context; + qmckl_memory_info_struct mem_info = qmckl_memory_info_struct_zero; + mem_info.size = ctx->nucleus.num * sizeof(int32_t); + ctx->ao_basis.prim_num_per_nucleus = (int32_t*) qmckl_malloc(context, mem_info); + + /* Find max number of primitives per nucleus */ int64_t shell_max = 0; int64_t prim_max = 0; int64_t nucl_num = ctx->nucleus.num; for (int inucl=0 ; inucl < nucl_num ; ++inucl) { shell_max = ctx->ao_basis.nucleus_shell_num[inucl] > shell_max ? - ctx->ao_basis.nucleus_shell_num[inucl] : shell_max; + ctx->ao_basis.nucleus_shell_num[inucl] : shell_max; int64_t prim_num = 0; const int64_t ishell_start = ctx->ao_basis.nucleus_index[inucl]; @@ -2689,9 +2717,11 @@ qmckl_exit_code qmckl_finalize_basis_hpc (qmckl_context context) prim_num += ctx->ao_basis.shell_prim_num[ishell]; } prim_max = prim_num > prim_max ? - prim_num : prim_max; + prim_num : prim_max; + ctx->ao_basis.prim_num_per_nucleus[inucl] = prim_num; } + int64_t size[3] = { prim_max, shell_max, nucl_num }; ctx->ao_basis.coef_per_nucleus = qmckl_tensor_alloc( context, 3, size ); ctx->ao_basis.coef_per_nucleus = qmckl_tensor_set(ctx->ao_basis.coef_per_nucleus, 0.); @@ -2699,9 +2729,10 @@ qmckl_exit_code qmckl_finalize_basis_hpc (qmckl_context context) ctx->ao_basis.expo_per_nucleus = qmckl_matrix_alloc( context, prim_max, nucl_num ); ctx->ao_basis.expo_per_nucleus = qmckl_matrix_set(ctx->ao_basis.expo_per_nucleus, 0.); - double expo[prim_max]; + struct combined expo[prim_max]; double coef[shell_max][prim_max]; - for (int inucl=0 ; inucl < nucl_num ; ++inucl) { + double newcoef[prim_max]; + for (int64_t inucl=0 ; inucl < nucl_num ; ++inucl) { memset(expo, 0, sizeof(expo)); memset(coef, 0, sizeof(expo)); @@ -2709,31 +2740,93 @@ qmckl_exit_code qmckl_finalize_basis_hpc (qmckl_context context) const int64_t ishell_start = ctx->ao_basis.nucleus_index[inucl]; const int64_t ishell_end = ctx->ao_basis.nucleus_index[inucl] + ctx->ao_basis.nucleus_shell_num[inucl]; for (int64_t ishell = ishell_start ; ishell < ishell_end ; ++ishell) { - + const int64_t iprim_start = ctx->ao_basis.shell_prim_index[ishell]; const int64_t iprim_end = ctx->ao_basis.shell_prim_index[ishell] + ctx->ao_basis.shell_prim_num[ishell]; for (int64_t iprim = iprim_start ; iprim < iprim_end ; ++iprim) { - expo[idx] = ctx->ao_basis.coefficient_normalized[iprim]; - coef[ishell - ishell_start][idx] = ctx->ao_basis.coefficient_normalized[iprim]; + expo[idx].expo = ctx->ao_basis.exponent[iprim]; + expo[idx].index = idx; idx += 1; } } - for (int64_t i=0 ; iao_basis.expo_per_nucleus, i, inucl ) = expo[i]; + /* Sort exponents */ + qsort( expo, (size_t) idx, sizeof(struct combined), compare_basis ); + + idx = 0; + int64_t idx2 = 0; + for (int64_t ishell = ishell_start ; ishell < ishell_end ; ++ishell) { + + memset(newcoef, 0, sizeof(newcoef)); + const int64_t iprim_start = ctx->ao_basis.shell_prim_index[ishell]; + const int64_t iprim_end = ctx->ao_basis.shell_prim_index[ishell] + ctx->ao_basis.shell_prim_num[ishell]; + for (int64_t iprim = iprim_start ; iprim < iprim_end ; ++iprim) { + newcoef[idx] = ctx->ao_basis.coefficient_normalized[iprim]; + idx += 1; + } + for (int32_t i=0 ; iao_basis.prim_num_per_nucleus[inucl] ; ++i) { + idx2 = expo[i].index; + coef[ishell - ishell_start][i] = newcoef[idx2]; + } } - for (int64_t j=0 ; jao_basis.prim_num_per_nucleus[inucl] ; ++i) { + if (expo[i].expo != expo[i-1].expo) { + idx += 1; + } + newidx[i] = idx; + } + idxmax = idx; + + for (int32_t j=0 ; jao_basis.prim_num_per_nucleus[inucl] ; ++i) { + newcoef[newidx[i]] += coef[j][i]; + } + for (int32_t i=0 ; iao_basis.prim_num_per_nucleus[inucl] ; ++i) { + coef[j][i] = newcoef[i]; + } + } + + for (int32_t i=0 ; iao_basis.prim_num_per_nucleus[inucl] ; ++i) { + expo[newidx[i]].expo = expo[i].expo; + } + ctx->ao_basis.prim_num_per_nucleus[inucl] = idxmax+1; + + for (int32_t i=0 ; iao_basis.prim_num_per_nucleus[inucl] ; ++i) { + qmckl_mat( ctx->ao_basis.expo_per_nucleus, i, inucl ) = expo[i].expo; + } + + for (int32_t j=0 ; jao_basis.prim_num_per_nucleus[inucl] ; ++i) { qmckl_ten3( ctx->ao_basis.coef_per_nucleus, i, j, inucl ) = coef[j][i]; } } +/* + for (int32_t i=0 ; iao_basis.prim_num_per_nucleus[inucl] ; ++i) { + printf("%4ld %4ld %15.5e | ", inucl, i, qmckl_mat( ctx->ao_basis.expo_per_nucleus, i, inucl )); + for (int64_t j=0 ; jao_basis.coef_per_nucleus, i, j, inucl )); + } + printf("\n"); + } + printf("\n"); +*/ } + + return QMCKL_SUCCESS; } #endif - + #+end_src *** Access functions @@ -4700,13 +4793,13 @@ end function qmckl_ao_polynomial_transp_vgl_doc_f #+begin_src c :tangle (eval c) :comments org qmckl_exit_code qmckl_ao_polynomial_transp_vgl_hpc (const qmckl_context context, - const double* X, - const double* R, + const double* restrict X, + const double* restrict R, const int32_t lmax, - int64_t* n, - int32_t* const L, + int64_t* restrict n, + int32_t* restrict const L, const int64_t ldl, - double* const VGL, + double* restrict const VGL, const int64_t ldv ) { const qmckl_context_struct* ctx = (qmckl_context_struct* const) context; @@ -4742,7 +4835,7 @@ qmckl_ao_polynomial_transp_vgl_hpc (const qmckl_context context, } else { - int32_t* const l[4] = {L, L+ldl, L+(ldl<<1), L+ldl+(ldl<<1)}; + int32_t* restrict const l[4] = {L, L+ldl, L+(ldl<<1), L+ldl+(ldl<<1)}; l[0][0] = 0; l[0][1] = 0; l[0][2] = 0; l[1][0] = 1; l[1][1] = 0; l[1][2] = 0; l[2][0] = 0; l[2][1] = 1; l[2][2] = 0; @@ -4763,11 +4856,11 @@ qmckl_ao_polynomial_transp_vgl_hpc (const qmckl_context context, } else { - double* const vgl1 = VGL; - double* const vgl2 = VGL + ldv; - double* const vgl3 = VGL + (ldv << 1); - double* const vgl4 = VGL + ldv + (ldv << 1); - double* const vgl5 = VGL + (ldv << 2); + double* restrict const vgl1 = VGL; + double* restrict const vgl2 = VGL + ldv; + double* restrict const vgl3 = VGL + (ldv << 1); + double* restrict const vgl4 = VGL + ldv + (ldv << 1); + double* restrict const vgl5 = VGL + (ldv << 2); for (int32_t k=0 ; k<4 ; ++k) { vgl2[k] = 0.0; @@ -4797,7 +4890,7 @@ qmckl_ao_polynomial_transp_vgl_hpc (const qmckl_context context, L[i] = ltmp[i]; } else { - int32_t* l[10]; + int32_t* restrict l[10]; for (int32_t i=0 ; i<10 ; ++i) { l[i] = L + i*ldl; } @@ -4833,11 +4926,11 @@ qmckl_ao_polynomial_transp_vgl_hpc (const qmckl_context context, for (int i=0 ; i<50 ; ++i) VGL[i] = tmp[i]; } else { - double* const vgl1 = VGL; - double* const vgl2 = VGL + ldv; - double* const vgl3 = VGL + (ldv << 1); - double* const vgl4 = VGL + 3*ldv; - double* const vgl5 = VGL + (ldv << 2); + double* restrict const vgl1 = VGL; + double* restrict const vgl2 = VGL + ldv; + double* restrict const vgl3 = VGL + (ldv << 1); + double* restrict const vgl4 = VGL + 3*ldv; + double* restrict const vgl5 = VGL + (ldv << 2); vgl1[0] = 1.0 ; vgl1[1] = Y[0] ; vgl1[2] = Y[1]; vgl1[3] = Y[2] ; vgl1[4] = Y[0]*Y[0]; vgl1[5] = Y[0]*Y[1]; @@ -4872,11 +4965,11 @@ qmckl_ao_polynomial_transp_vgl_hpc (const qmckl_context context, const int32_t size_max = (lmax+1)*(lmax+2)*(lmax+3)/6; if (ldv < size_max) return QMCKL_INVALID_ARG_9; - double* const vgl1 = VGL; - double* const vgl2 = VGL + ldv; - double* const vgl3 = VGL + (ldv<<1); - double* const vgl4 = VGL + ldv + (ldv<<1); - double* const vgl5 = VGL + (ldv<<2); + double* restrict const vgl1 = VGL; + double* restrict const vgl2 = VGL + ldv; + double* restrict const vgl3 = VGL + (ldv<<1); + double* restrict const vgl4 = VGL + ldv + (ldv<<1); + double* restrict const vgl5 = VGL + (ldv<<2); const double Y[3] = { X[0]-R[0], X[1]-R[1], @@ -5375,29 +5468,28 @@ end function qmckl_compute_ao_vgl_doc_f | ~coef_normalized~ | ~double[prim_num]~ | in | Value, gradients and Laplacian of the shells | | ~ao_vgl~ | ~double[point_num][5][ao_num]~ | out | Value, gradients and Laplacian of the AOs | + #+begin_src c :comments org :tangle (eval c) :noweb yes :exports none #ifdef HAVE_HPC qmckl_exit_code qmckl_compute_ao_vgl_hpc_gaussian ( - const qmckl_context context, - const int64_t ao_num, - const int64_t shell_num, - const int64_t prim_num, - const int64_t point_num, - const int64_t nucl_num, - const double* coord, - const double* nucl_coord, - const int64_t* nucleus_index, - const int64_t* nucleus_shell_num, - const double* nucleus_range, - const int32_t* nucleus_max_ang_mom, - const int32_t* shell_ang_mom, - const int64_t* shell_prim_index, - const int64_t* shell_prim_num, - const double* ao_factor, - const double* expo, - const double* coef_normalized, - double* const ao_vgl ) + const qmckl_context context, + const int64_t ao_num, + const int64_t shell_num, + const int32_t* restrict prim_num_per_nucleus, + const int64_t point_num, + const int64_t nucl_num, + const double* restrict coord, + const double* restrict nucl_coord, + const int64_t* restrict nucleus_index, + const int64_t* restrict nucleus_shell_num, + const double* nucleus_range, + const int32_t* restrict nucleus_max_ang_mom, + const int32_t* restrict shell_ang_mom, + const double* restrict ao_factor, + const qmckl_matrix expo_per_nucleus, + const qmckl_tensor coef_per_nucleus, + double* restrict const ao_vgl ) { int32_t lstart[32]; for (int32_t l=0 ; l<32 ; ++l) { @@ -5406,37 +5498,41 @@ qmckl_compute_ao_vgl_hpc_gaussian ( int64_t ao_index[shell_num+1]; int64_t size_max = 0; + int64_t prim_max = 0; + int64_t shell_max = 0; { - int64_t k=0; - for (int inucl=0 ; inucl < nucl_num ; ++inucl) { - const int64_t ishell_start = nucleus_index[inucl]; - const int64_t ishell_end = nucleus_index[inucl] + nucleus_shell_num[inucl]; - for (int64_t ishell = ishell_start ; ishell < ishell_end ; ++ishell) { - const int l = shell_ang_mom[ishell]; - ao_index[ishell] = k; - k += lstart[l+1] - lstart[l]; - size_max = size_max < lstart[l+1] ? lstart[l+1] : size_max; - } - } - ao_index[shell_num] = ao_num+1; + int64_t k=0; + for (int inucl=0 ; inucl < nucl_num ; ++inucl) { + prim_max = prim_num_per_nucleus[inucl] > prim_max ? + prim_num_per_nucleus[inucl] : prim_max; + shell_max = nucleus_shell_num[inucl] > shell_max ? + nucleus_shell_num[inucl] : shell_max; + const int64_t ishell_start = nucleus_index[inucl]; + const int64_t ishell_end = nucleus_index[inucl] + nucleus_shell_num[inucl]; + for (int64_t ishell = ishell_start ; ishell < ishell_end ; ++ishell) { + const int l = shell_ang_mom[ishell]; + ao_index[ishell] = k; + k += lstart[l+1] - lstart[l]; + size_max = size_max < lstart[l+1] ? lstart[l+1] : size_max; + } + } + ao_index[shell_num] = ao_num+1; } /* Don't compute polynomials when the radial part is zero. */ double cutoff = -log(1.e-12); #ifdef HAVE_OPENMP - #pragma omp parallel +#pragma omp parallel #endif { qmckl_exit_code rc; - double c_[prim_num]; - double expo_[prim_num]; - double ar2[prim_num]; - int32_t powers[prim_num]; + double ar2[prim_max]; + int32_t powers[prim_max]; double poly_vgl_l1[4][4] = {{1.0, 0.0, 0.0, 0.0}, - {0.0, 1.0, 0.0, 0.0}, - {0.0, 0.0, 1.0, 0.0}, - {0.0, 0.0, 0.0, 1.0}}; + {0.0, 1.0, 0.0, 0.0}, + {0.0, 0.0, 1.0, 0.0}, + {0.0, 0.0, 0.0, 1.0}}; double poly_vgl_l2[5][10] = {{1., 0., 0., 0., 0., 0., 0., 0., 0., 0.}, {0., 1., 0., 0., 0., 0., 0., 0., 0., 0.}, {0., 0., 1., 0., 0., 0., 0., 0., 0., 0.}, @@ -5444,216 +5540,247 @@ qmckl_compute_ao_vgl_hpc_gaussian ( {0., 0., 0., 0., 2., 0., 0., 2., 0., 2.}}; double poly_vgl[5][size_max]; + double exp_mat[prim_max][8]; + double ce_mat[shell_max][8]; + + double coef_mat[nucl_num][shell_max][prim_max]; + for (int i=0 ; i cutoff * nucleus_range[inucl]) { - continue; + if (r2 > cutoff * nucleus_range[inucl]) { + continue; + } + + int64_t n_poly; + switch (nucleus_max_ang_mom[inucl]) { + case 0: + break; + + case 1: + poly_vgl_l1[0][1] = x; + poly_vgl_l1[0][2] = y; + poly_vgl_l1[0][3] = z; + break; + + case 2: + poly_vgl_l2[0][1] = x; + poly_vgl_l2[0][2] = y; + poly_vgl_l2[0][3] = z; + poly_vgl_l2[0][4] = x*x; + poly_vgl_l2[0][5] = x*y; + poly_vgl_l2[0][6] = x*z; + poly_vgl_l2[0][7] = y*y; + poly_vgl_l2[0][8] = y*z; + poly_vgl_l2[0][9] = z*z; + poly_vgl_l2[1][4] = x+x; + poly_vgl_l2[1][5] = y; + poly_vgl_l2[1][6] = z; + poly_vgl_l2[2][5] = x; + poly_vgl_l2[2][7] = y+y; + poly_vgl_l2[2][8] = z; + poly_vgl_l2[3][6] = x; + poly_vgl_l2[3][8] = y; + poly_vgl_l2[3][9] = z+z; + break; + + default: + rc = qmckl_ao_polynomial_transp_vgl_hpc(context, e_coord, n_coord, + nucleus_max_ang_mom[inucl], + &n_poly, powers, (int64_t) 3, + &(poly_vgl[0][0]), size_max); + + assert (rc == QMCKL_SUCCESS); + break; + } + + /* Compute all exponents */ + + int64_t nidx = 0; + for (int64_t iprim = 0 ; iprim < prim_num_per_nucleus[inucl] ; ++iprim) { + const double v = qmckl_mat(expo_per_nucleus, iprim, inucl) * r2; + if (v <= cutoff) { + ar2[iprim] = v; + ++nidx; + } else { + break; } + } - int64_t n_poly; - switch (nucleus_max_ang_mom[inucl]) { - case 0: - break; + for (int64_t iprim = 0 ; iprim < nidx ; ++iprim) { + exp_mat[iprim][0] = exp(-ar2[iprim]); + } + for (int64_t iprim = 0 ; iprim < nidx ; ++iprim) { + double f = qmckl_mat(expo_per_nucleus, iprim, inucl) * exp_mat[iprim][0]; + f = -f-f; + exp_mat[iprim][1] = f * x; + exp_mat[iprim][2] = f * y; + exp_mat[iprim][3] = f * z; + exp_mat[iprim][4] = f * (3.0 - 2.0 * ar2[iprim]); + } - case 1: - poly_vgl_l1[0][1] = x; - poly_vgl_l1[0][2] = y; - poly_vgl_l1[0][3] = z; - break; + for (int i=0 ; i 0) { + const double* restrict f = ao_factor + k; + const int64_t idx = lstart[l]; + + switch (nucleus_max_ang_mom[inucl]) { + case 0: break; + case 1: + poly_vgl_1 = &(poly_vgl_l1[0][idx]); + poly_vgl_2 = &(poly_vgl_l1[1][idx]); + poly_vgl_3 = &(poly_vgl_l1[2][idx]); + poly_vgl_4 = &(poly_vgl_l1[3][idx]); + break; + case 2: + poly_vgl_1 = &(poly_vgl_l2[0][idx]); + poly_vgl_2 = &(poly_vgl_l2[1][idx]); + poly_vgl_3 = &(poly_vgl_l2[2][idx]); + poly_vgl_4 = &(poly_vgl_l2[3][idx]); + poly_vgl_5 = &(poly_vgl_l2[4][idx]); + break; + default: + poly_vgl_1 = &(poly_vgl[0][idx]); + poly_vgl_2 = &(poly_vgl[1][idx]); + poly_vgl_3 = &(poly_vgl[2][idx]); + poly_vgl_4 = &(poly_vgl[3][idx]); + poly_vgl_5 = &(poly_vgl[4][idx]); + } + switch (n) { + case(1): + ao_vgl_1[0] = s1 * f[0]; + ao_vgl_2[0] = s2 * f[0]; + ao_vgl_3[0] = s3 * f[0]; + ao_vgl_4[0] = s4 * f[0]; + ao_vgl_5[0] = s5; + break; + case (3): +#ifdef HAVE_OPENMP +#pragma omp simd +#endif + for (int il=0 ; il<3 ; ++il) { + ao_vgl_1[il] = poly_vgl_1[il] * s1 * f[il]; + ao_vgl_2[il] = (poly_vgl_2[il] * s1 + poly_vgl_1[il] * s2) * f[il]; + ao_vgl_3[il] = (poly_vgl_3[il] * s1 + poly_vgl_1[il] * s3) * f[il]; + ao_vgl_4[il] = (poly_vgl_4[il] * s1 + poly_vgl_1[il] * s4) * f[il]; + ao_vgl_5[il] = (poly_vgl_1[il] * s5 + + 2.0*(poly_vgl_2[il] * s2 + + poly_vgl_3[il] * s3 + + poly_vgl_4[il] * s4 )) * f[il]; + } + break; + case(6): +#ifdef HAVE_OPENMP +#pragma omp simd +#endif + for (int il=0 ; il<6 ; ++il) { + ao_vgl_1[il] = poly_vgl_1[il] * s1 * f[il]; + ao_vgl_2[il] = (poly_vgl_2[il] * s1 + poly_vgl_1[il] * s2) * f[il]; + ao_vgl_3[il] = (poly_vgl_3[il] * s1 + poly_vgl_1[il] * s3) * f[il]; + ao_vgl_4[il] = (poly_vgl_4[il] * s1 + poly_vgl_1[il] * s4) * f[il]; + ao_vgl_5[il] = (poly_vgl_5[il] * s1 + poly_vgl_1[il] * s5 + + 2.0*(poly_vgl_2[il] * s2 + + poly_vgl_3[il] * s3 + + poly_vgl_4[il] * s4 )) * f[il]; + } + break; + default: +#ifdef HAVE_OPENMP +#pragma omp simd simdlen(8) +#endif + for (int il=0 ; il 0) { - const double* __restrict__ f = ao_factor + k; - const int64_t idx = lstart[l]; - - switch (nucleus_max_ang_mom[inucl]) { - case 0: - break; - case 1: - poly_vgl_1 = &(poly_vgl_l1[0][idx]); - poly_vgl_2 = &(poly_vgl_l1[1][idx]); - poly_vgl_3 = &(poly_vgl_l1[2][idx]); - poly_vgl_4 = &(poly_vgl_l1[3][idx]); - break; - case 2: - poly_vgl_1 = &(poly_vgl_l2[0][idx]); - poly_vgl_2 = &(poly_vgl_l2[1][idx]); - poly_vgl_3 = &(poly_vgl_l2[2][idx]); - poly_vgl_4 = &(poly_vgl_l2[3][idx]); - break; - default: - poly_vgl_1 = &(poly_vgl[0][idx]); - poly_vgl_2 = &(poly_vgl[1][idx]); - poly_vgl_3 = &(poly_vgl[2][idx]); - poly_vgl_4 = &(poly_vgl[3][idx]); - poly_vgl_5 = &(poly_vgl[4][idx]); - } - switch (n) { - case(1): - ao_vgl_1[0] = s1 * f[0]; - ao_vgl_2[0] = s2 * f[0]; - ao_vgl_3[0] = s3 * f[0]; - ao_vgl_4[0] = s4 * f[0]; - ao_vgl_5[0] = s5; - break; - case (3): -#ifdef HAVE_OPENMP - #pragma omp simd -#endif - for (int il=0 ; il<3 ; ++il) { - ao_vgl_1[il] = poly_vgl_1[il] * s1 * f[il]; - ao_vgl_2[il] = (poly_vgl_2[il] * s1 + poly_vgl_1[il] * s2) * f[il]; - ao_vgl_3[il] = (poly_vgl_3[il] * s1 + poly_vgl_1[il] * s3) * f[il]; - ao_vgl_4[il] = (poly_vgl_4[il] * s1 + poly_vgl_1[il] * s4) * f[il]; - ao_vgl_5[il] = (poly_vgl_1[il] * s5 + - 2.0*(poly_vgl_2[il] * s2 + - poly_vgl_3[il] * s3 + - poly_vgl_4[il] * s4 )) * f[il]; - } - break; - case(5): -#ifdef HAVE_OPENMP - #pragma omp simd -#endif - for (int il=0 ; il<5 ; ++il) { - ao_vgl_1[il] = poly_vgl_1[il] * s1 * f[il]; - ao_vgl_2[il] = (poly_vgl_2[il] * s1 + poly_vgl_1[il] * s2) * f[il]; - ao_vgl_3[il] = (poly_vgl_3[il] * s1 + poly_vgl_1[il] * s3) * f[il]; - ao_vgl_4[il] = (poly_vgl_4[il] * s1 + poly_vgl_1[il] * s4) * f[il]; - ao_vgl_5[il] = (poly_vgl_1[il] * s5 + - 2.0*(poly_vgl_2[il] * s2 + - poly_vgl_3[il] * s3 + - poly_vgl_4[il] * s4 )) * f[il]; - } - break; - default: -#ifdef HAVE_OPENMP - #pragma omp simd simdlen(8) -#endif - for (int il=0 ; ilao_basis.ao_num, ctx->ao_basis.shell_num, - ctx->ao_basis.prim_num, + ctx->ao_basis.prim_num_per_nucleus, ctx->point.num, ctx->nucleus.num, ctx->point.coord.data, @@ -5840,11 +5965,9 @@ qmckl_exit_code qmckl_provide_ao_vgl(qmckl_context context) ctx->ao_basis.nucleus_range, ctx->ao_basis.nucleus_max_ang_mom, ctx->ao_basis.shell_ang_mom, - ctx->ao_basis.shell_prim_index, - ctx->ao_basis.shell_prim_num, ctx->ao_basis.ao_factor, - ctx->ao_basis.exponent, - ctx->ao_basis.coefficient_normalized, + ctx->ao_basis.expo_per_nucleus, + ctx->ao_basis.coef_per_nucleus, ctx->ao_basis.ao_vgl); /* } else if (ctx->ao_basis.type == 'S') { @@ -5919,77 +6042,74 @@ qmckl_exit_code qmckl_provide_ao_vgl(qmckl_context context) import numpy as np from math import sqrt +h0 = 1.e-4 def f(a,x,y): return np.sum( [c * np.exp( -b*(np.linalg.norm(x-y))**2) for b,c in a] ) +def fx(a,x,y): + return f(a,x,y) * (x[0] - y[0])**2 + + def df(a,x,y,n): - h0 = 1.e-6 - if n == 1: h = np.array([h0,0.,0.]) - elif n == 2: h = np.array([0.,h0,0.]) - elif n == 3: h = np.array([0.,0.,h0]) - return ( f(a,x+h,y) - f(a,x-h,y) ) / (2.*h0) + if n == 1: h = np.array([h0,0.,0.]) + elif n == 2: h = np.array([0.,h0,0.]) + elif n == 3: h = np.array([0.,0.,h0]) + return ( fx(a,x+h,y) - fx(a,x-h,y) ) / (2.*h0) +# return np.sum( [-2.*b * c * np.exp( -b*(np.linalg.norm(x-y))**2) for b,c in a] ) * (x-y)[n-1] def d2f(a,x,y,n): - h0 = 1.e-6 if n == 1: h = np.array([h0,0.,0.]) elif n == 2: h = np.array([0.,h0,0.]) elif n == 3: h = np.array([0.,0.,h0]) - return ( f(a,x+h,y) - 2.*f(a,x,y) + f(a,x-h,y) ) / h0**2 + return ( fx(a,x+h,y) - 2.*fx(a,x,y) + fx(a,x-h,y) ) / h0**2 +# return np.sum( [( (2.*b*(x-y)[n-1])**2 -2.*b ) * c * np.exp( -b*(np.linalg.norm(x-y))**2) for b,c in a] ) def lf(a,x,y): +# return np.sum( [( (2.*b*np.linalg.norm(x-y))**2 -6.*b ) * c * np.exp( -b*(np.linalg.norm(x-y))**2) for b,c in a] ) return d2f(a,x,y,1) + d2f(a,x,y,2) + d2f(a,x,y,3) + elec_26_w1 = np.array( [ 1.49050402641, 2.90106987953, -1.05920815468 ] ) elec_15_w2 = np.array( [ -2.20180344582,-1.9113150239, 2.2193744778600002 ] ) nucl_1 = np.array( [ -2.302574592081335e+00, -3.542027060505035e-01, -5.334129934317614e-02] ) #double ao_vgl[prim_num][5][elec_num]; x = elec_26_w1 ; y = nucl_1 -a = [( 403.830000, 0.001473 * 5.9876577632594533e+04), - ( 121.170000, 0.012672 * 7.2836806319891484e+03), - ( 46.345000, 0.058045 * 1.3549226646722386e+03), - ( 19.721000, 0.170510 * 3.0376315094739988e+02), - ( 8.862400, 0.318596 * 7.4924579607137730e+01), - ( 3.996200, 0.384502 * 1.8590543353806009e+01), - ( 1.763600, 0.273774 * 4.4423176930919421e+00), - ( 0.706190, 0.074397 * 8.9541051939952665e-01)] +a = [( 4.0382999999999998e+02, 1.4732000000000000e-03 * 5.9876577632594533e+04), + ( 1.2117000000000000e+02, 1.2672500000000000e-02 * 7.2836806319891484e+03), + ( 4.6344999999999999e+01, 5.8045100000000002e-02 * 1.3549226646722386e+03), + ( 1.9721000000000000e+01, 1.7051030000000000e-01 * 3.0376315094739988e+02), + ( 8.8623999999999992e+00, 3.1859579999999998e-01 * 7.4924579607137730e+01), + ( 3.9962000000000000e+00, 3.8450230000000002e-01 * 1.8590543353806009e+01), + ( 1.7636000000000001e+00, 2.7377370000000001e-01 * 4.4423176930919421e+00), + ( 7.0618999999999998e-01, 7.4396699999999996e-02 * 8.9541051939952665e-01)] norm = sqrt(3.) -print ( "[0][26][219] : %25.15e"%(f(a,x,y) * (x[0] - y[0])**2) ) -print ( "[1][26][219] : %25.15e"%(df(a,x,y,1)* (x[0] - y[0]) * (x[1] - y[1]) + 2.*f(a,x,y) * (x[0] - y[0])) ) +# x^2 * g(r) +print ( "[26][0][219] : %25.15e"%(fx(a,x,y)) ) +print ( "[26][1][219] : %25.15e"%(df(a,x,y,1)) ) +print ( "[26][2][219] : %25.15e"%(df(a,x,y,2)) ) +print ( "[26][3][219] : %25.15e"%(df(a,x,y,3)) ) +print ( "[26][4][219] : %25.15e"%(lf(a,x,y)) ) -print ( "[0][26][220] : %25.15e"%(norm*f(a,x,y) * (x[0] - y[0]) * (x[1] - y[1]) )) -print ( "[1][26][220] : %25.15e"%(norm*df(a,x,y,1)* (x[0] - y[0]) * (x[1] - y[1]) + norm*f(a,x,y) * (x[1] - y[1])) ) +print ( "[26][0][220] : %25.15e"%(norm*f(a,x,y) * (x[0] - y[0]) * (x[1] - y[1]) )) +print ( "[26][1][220] : %25.15e"%(norm*df(a,x,y,1)* (x[0] - y[0]) * (x[1] - y[1]) + norm*f(a,x,y) * (x[1] - y[1])) ) -print ( "[0][26][221] : %25.15e"%(norm*f(a,x,y) * (x[0] - y[0]) * (x[2] - y[2])) ) -print ( "[1][26][221] : %25.15e"%(norm*df(a,x,y,1)* (x[0] - y[0]) * (x[2] - y[2]) + norm*f(a,x,y) * (x[2] - y[2])) ) +print ( "[26][0][221] : %25.15e"%(norm*f(a,x,y) * (x[0] - y[0]) * (x[2] - y[2])) ) +print ( "[26][1][221] : %25.15e"%(norm*df(a,x,y,1)* (x[0] - y[0]) * (x[2] - y[2]) + norm*f(a,x,y) * (x[2] - y[2])) ) -print ( "[0][26][222] : %25.15e"%(f(a,x,y) * (x[1] - y[1]) * (x[1] - y[1])) ) -print ( "[1][26][222] : %25.15e"%(df(a,x,y,1)* (x[1] - y[1]) * (x[1] - y[1])) ) +print ( "[26][0][222] : %25.15e"%(f(a,x,y) * (x[1] - y[1]) * (x[1] - y[1])) ) +print ( "[26][1][222] : %25.15e"%(df(a,x,y,1)* (x[1] - y[1]) * (x[1] - y[1])) ) -print ( "[0][26][223] : %25.15e"%(norm*f(a,x,y) * (x[1] - y[1]) * (x[2] - y[2])) ) -print ( "[1][26][223] : %25.15e"%(norm*df(a,x,y,1)* (x[1] - y[1]) * (x[2] - y[2])) ) +print ( "[26][0][223] : %25.15e"%(norm*f(a,x,y) * (x[1] - y[1]) * (x[2] - y[2])) ) +print ( "[26][1][223] : %25.15e"%(norm*df(a,x,y,1)* (x[1] - y[1]) * (x[2] - y[2])) ) -print ( "[0][26][224] : %25.15e"%(f(a,x,y) * (x[2] - y[2]) * (x[2] - y[2])) ) -print ( "[1][26][224] : %25.15e"%(df(a,x,y,1)* (x[2] - y[2]) * (x[2] - y[2])) ) +print ( "[26][0][224] : %25.15e"%(f(a,x,y) * (x[2] - y[2]) * (x[2] - y[2])) ) +print ( "[26][1][224] : %25.15e"%(df(a,x,y,1)* (x[2] - y[2]) * (x[2] - y[2])) ) #+end_src #+RESULTS: - #+begin_example - [0][26][219] : 1.020302912653649e-08 - [1][26][219] : -4.153046808203204e-08 - [0][26][220] : 1.516649653540510e-08 - [1][26][220] : -7.725252615816528e-08 - [0][26][221] : -4.686389780112468e-09 - [1][26][221] : 2.387073693851122e-08 - [0][26][222] : 7.514847283937212e-09 - [1][26][222] : -4.025905373647693e-08 - [0][26][223] : -4.021924592380977e-09 - [1][26][223] : 2.154652944642284e-08 - [0][26][224] : 7.175074806631758e-10 - [1][26][224] : -3.843880138733679e-09 - #+end_example #+begin_src c :tangle (eval c_test) :exports none { @@ -6021,32 +6141,69 @@ rc = qmckl_get_ao_basis_ao_vgl(context, &(ao_vgl[0][0][0]), assert (rc == QMCKL_SUCCESS); printf("\n"); -printf(" ao_vgl ao_vgl[0][26][219] %25.15e\n", ao_vgl[26][0][219]); -printf(" ao_vgl ao_vgl[1][26][219] %25.15e\n", ao_vgl[26][1][219]); -printf(" ao_vgl ao_vgl[0][26][220] %25.15e\n", ao_vgl[26][0][220]); -printf(" ao_vgl ao_vgl[1][26][220] %25.15e\n", ao_vgl[26][1][220]); -printf(" ao_vgl ao_vgl[0][26][221] %25.15e\n", ao_vgl[26][0][221]); -printf(" ao_vgl ao_vgl[1][26][221] %25.15e\n", ao_vgl[26][1][221]); -printf(" ao_vgl ao_vgl[0][26][222] %25.15e\n", ao_vgl[26][0][222]); -printf(" ao_vgl ao_vgl[1][26][222] %25.15e\n", ao_vgl[26][1][222]); -printf(" ao_vgl ao_vgl[0][26][223] %25.15e\n", ao_vgl[26][0][223]); -printf(" ao_vgl ao_vgl[1][26][223] %25.15e\n", ao_vgl[26][1][223]); -printf(" ao_vgl ao_vgl[0][26][224] %25.15e\n", ao_vgl[26][0][224]); -printf(" ao_vgl ao_vgl[1][26][224] %25.15e\n", ao_vgl[26][1][224]); +printf(" ao_vgl ao_vgl[26][0][219] %25.15e\n", ao_vgl[26][0][219]); +printf(" ao_vgl ao_vgl[26][1][219] %25.15e\n", ao_vgl[26][1][219]); +printf(" ao_vgl ao_vgl[26][2][219] %25.15e\n", ao_vgl[26][2][219]); +printf(" ao_vgl ao_vgl[26][3][219] %25.15e\n", ao_vgl[26][3][219]); +printf(" ao_vgl ao_vgl[26][4][219] %25.15e\n", ao_vgl[26][4][219]); +printf(" ao_vgl ao_vgl[26][0][220] %25.15e\n", ao_vgl[26][0][220]); +printf(" ao_vgl ao_vgl[26][1][220] %25.15e\n", ao_vgl[26][1][220]); +printf(" ao_vgl ao_vgl[26][2][220] %25.15e\n", ao_vgl[26][2][220]); +printf(" ao_vgl ao_vgl[26][3][220] %25.15e\n", ao_vgl[26][3][220]); +printf(" ao_vgl ao_vgl[26][4][220] %25.15e\n", ao_vgl[26][4][220]); +printf(" ao_vgl ao_vgl[26][0][221] %25.15e\n", ao_vgl[26][0][221]); +printf(" ao_vgl ao_vgl[26][1][221] %25.15e\n", ao_vgl[26][1][221]); +printf(" ao_vgl ao_vgl[26][2][221] %25.15e\n", ao_vgl[26][2][221]); +printf(" ao_vgl ao_vgl[26][3][221] %25.15e\n", ao_vgl[26][3][221]); +printf(" ao_vgl ao_vgl[26][4][221] %25.15e\n", ao_vgl[26][4][221]); +printf(" ao_vgl ao_vgl[26][0][222] %25.15e\n", ao_vgl[26][0][222]); +printf(" ao_vgl ao_vgl[26][1][222] %25.15e\n", ao_vgl[26][1][222]); +printf(" ao_vgl ao_vgl[26][2][222] %25.15e\n", ao_vgl[26][2][222]); +printf(" ao_vgl ao_vgl[26][3][222] %25.15e\n", ao_vgl[26][3][222]); +printf(" ao_vgl ao_vgl[26][4][222] %25.15e\n", ao_vgl[26][4][222]); +printf(" ao_vgl ao_vgl[26][0][223] %25.15e\n", ao_vgl[26][0][223]); +printf(" ao_vgl ao_vgl[26][1][223] %25.15e\n", ao_vgl[26][1][223]); +printf(" ao_vgl ao_vgl[26][2][223] %25.15e\n", ao_vgl[26][2][223]); +printf(" ao_vgl ao_vgl[26][3][223] %25.15e\n", ao_vgl[26][3][223]); +printf(" ao_vgl ao_vgl[26][4][223] %25.15e\n", ao_vgl[26][4][223]); +printf(" ao_vgl ao_vgl[26][0][224] %25.15e\n", ao_vgl[26][0][224]); +printf(" ao_vgl ao_vgl[26][1][224] %25.15e\n", ao_vgl[26][1][224]); +printf(" ao_vgl ao_vgl[26][2][224] %25.15e\n", ao_vgl[26][2][224]); +printf(" ao_vgl ao_vgl[26][3][224] %25.15e\n", ao_vgl[26][3][224]); +printf(" ao_vgl ao_vgl[26][4][224] %25.15e\n", ao_vgl[26][4][224]); printf("\n"); -assert( fabs(ao_vgl[26][0][219] - ( 1.020298798341620e-08)) < 1.e-14 ); -assert( fabs(ao_vgl[26][1][219] - (-4.928035238010602e-08)) < 1.e-14 ); -assert( fabs(ao_vgl[26][0][220] - ( 1.516643537739178e-08)) < 1.e-14 ); -assert( fabs(ao_vgl[26][1][220] - (-7.725221462603871e-08)) < 1.e-14 ); -assert( fabs(ao_vgl[26][0][221] - (-4.686370882518819e-09)) < 1.e-14 ); -assert( fabs(ao_vgl[26][1][221] - ( 2.387064067626827e-08)) < 1.e-14 ); -assert( fabs(ao_vgl[26][0][222] - ( 7.514816980753531e-09)) < 1.e-14 ); -assert( fabs(ao_vgl[26][1][222] - (-4.025889138635182e-08)) < 1.e-14 ); -assert( fabs(ao_vgl[26][0][223] - (-4.021908374204471e-09)) < 1.e-14 ); -assert( fabs(ao_vgl[26][1][223] - ( 2.154644255710413e-08)) < 1.e-14 ); -assert( fabs(ao_vgl[26][0][224] - ( 7.175045873560788e-10)) < 1.e-14 ); -assert( fabs(ao_vgl[26][1][224] - (-3.843864637762753e-09)) < 1.e-14 ); +assert( fabs(ao_vgl[26][0][219] - ( 1.020298798341620e-08)) < 1.e-14 ); +assert( fabs(ao_vgl[26][1][219] - ( -4.928035238010602e-08)) < 1.e-14 ); +assert( fabs(ao_vgl[26][2][219] - ( -4.691009312035986e-08)) < 1.e-14 ); +assert( fabs(ao_vgl[26][3][219] - ( 1.449504046436699e-08)) < 1.e-14 ); +assert( fabs(ao_vgl[26][4][219] - ( 4.296442111843973e-07)) < 1.e-14 ); +assert( fabs(ao_vgl[26][0][220] - ( 1.516643537739178e-08)) < 1.e-14 ); +assert( fabs(ao_vgl[26][1][220] - ( -7.725221462603871e-08)) < 1.e-14 ); +assert( fabs(ao_vgl[26][2][220] - ( -6.507140835104833e-08)) < 1.e-14 ); +assert( fabs(ao_vgl[26][3][220] - ( 2.154644255710413e-08)) < 1.e-14 ); +assert( fabs(ao_vgl[26][4][220] - ( 6.365449359656352e-07)) < 1.e-14 ); +assert( fabs(ao_vgl[26][0][221] - ( -4.686370882518819e-09)) < 1.e-14 ); +assert( fabs(ao_vgl[26][1][221] - ( 2.387064067626827e-08)) < 1.e-14 ); +assert( fabs(ao_vgl[26][2][221] - ( 2.154644255710412e-08)) < 1.e-14 ); +assert( fabs(ao_vgl[26][3][221] - ( -1.998731863512374e-09)) < 1.e-14 ); +assert( fabs(ao_vgl[26][4][221] - ( -1.966899656441993e-07)) < 1.e-14 ); +assert( fabs(ao_vgl[26][0][222] - ( 7.514816980753531e-09)) < 1.e-14 ); +assert( fabs(ao_vgl[26][1][222] - ( -4.025889138635182e-08)) < 1.e-14 ); +assert( fabs(ao_vgl[26][2][222] - ( -2.993372555126361e-08)) < 1.e-14 ); +assert( fabs(ao_vgl[26][3][222] - ( 1.067604670272904e-08)) < 1.e-14 ); +assert( fabs(ao_vgl[26][4][222] - ( 3.168199650002648e-07)) < 1.e-14 ); +assert( fabs(ao_vgl[26][0][223] - ( -4.021908374204471e-09)) < 1.e-14 ); +assert( fabs(ao_vgl[26][1][223] - ( 2.154644255710413e-08)) < 1.e-14 ); +assert( fabs(ao_vgl[26][2][223] - ( 1.725594944732276e-08)) < 1.e-14 ); +assert( fabs(ao_vgl[26][3][223] - ( -1.715339357718333e-09)) < 1.e-14 ); +assert( fabs(ao_vgl[26][4][223] - ( -1.688020516893476e-07)) < 1.e-14 ); +assert( fabs(ao_vgl[26][0][224] - ( 7.175045873560788e-10)) < 1.e-14 ); +assert( fabs(ao_vgl[26][1][224] - ( -3.843864637762753e-09)) < 1.e-14 ); +assert( fabs(ao_vgl[26][2][224] - ( -3.298857850451910e-09)) < 1.e-14 ); +assert( fabs(ao_vgl[26][3][224] - ( -4.073047518790881e-10)) < 1.e-14 ); +assert( fabs(ao_vgl[26][4][224] - ( 3.153244195820293e-08)) < 1.e-14 ); + } #+end_src @@ -6100,6 +6257,5 @@ assert( fabs(ao_vgl[26][1][224] - (-3.843864637762753e-09)) < 1.e-14 ); # -*- mode: org -*- # vim: syntax=c - * TODO [0/1] Missing features :noexport: - [ ] Error messages to tell what is missing when initializing diff --git a/org/qmckl_blas.org b/org/qmckl_blas.org index 73f9e46..9cd7e18 100644 --- a/org/qmckl_blas.org +++ b/org/qmckl_blas.org @@ -85,7 +85,7 @@ are not intended to be passed to external codes. #+begin_src c :comments org :tangle (eval h_private_type) :exports none typedef struct qmckl_vector { int64_t size; - double* data; + double* restrict data; } qmckl_vector; #+end_src @@ -161,7 +161,7 @@ qmckl_vector_free( qmckl_context context, #+begin_src c :comments org :tangle (eval h_private_type) :exports none typedef struct qmckl_matrix { int64_t size[2]; - double* data; + double* restrict data; } qmckl_matrix; #+end_src @@ -247,7 +247,7 @@ qmckl_matrix_free( qmckl_context context, typedef struct qmckl_tensor { int64_t order; int64_t size[QMCKL_TENSOR_ORDER_MAX]; - double* data; + double* restrict data; } qmckl_tensor; #+end_src diff --git a/org/qmckl_determinant.org b/org/qmckl_determinant.org index a28f49c..0412db6 100644 --- a/org/qmckl_determinant.org +++ b/org/qmckl_determinant.org @@ -1239,9 +1239,9 @@ assert(rc == QMCKL_SUCCESS); assert(qmckl_ao_basis_provided(context)); -double ao_vgl[5][walk_num][elec_num][chbrclf_ao_num]; +double ao_vgl[walk_num*elec_num][5][chbrclf_ao_num]; -rc = qmckl_get_ao_basis_ao_vgl(context, &(ao_vgl[0][0][0][0]), (int64_t) 5*walk_num*elec_num*chbrclf_ao_num); +rc = qmckl_get_ao_basis_ao_vgl(context, &(ao_vgl[0][0][0]), (int64_t) 5*walk_num*elec_num*chbrclf_ao_num); assert (rc == QMCKL_SUCCESS); /* Set up MO data */ @@ -1256,8 +1256,8 @@ assert (rc == QMCKL_SUCCESS); assert(qmckl_mo_basis_provided(context)); -double mo_vgl[5][elec_num][chbrclf_mo_num]; -rc = qmckl_get_mo_basis_vgl(context, &(mo_vgl[0][0][0])); +double mo_vgl[walk_num*elec_num][5][chbrclf_mo_num]; +rc = qmckl_get_mo_basis_mo_vgl(context, &(mo_vgl[0][0][0]), 5*walk_num*elec_num*chbrclf_mo_num); assert (rc == QMCKL_SUCCESS); /* Set up determinant data */ diff --git a/org/qmckl_local_energy.org b/org/qmckl_local_energy.org index ccd933d..c5018c0 100644 --- a/org/qmckl_local_energy.org +++ b/org/qmckl_local_energy.org @@ -655,9 +655,9 @@ assert(rc == QMCKL_SUCCESS); assert(qmckl_ao_basis_provided(context)); -double ao_vgl[5][walk_num][elec_num][chbrclf_ao_num]; +double ao_vgl[walk_num*elec_num][5][chbrclf_ao_num]; -rc = qmckl_get_ao_basis_ao_vgl(context, &(ao_vgl[0][0][0][0]), +rc = qmckl_get_ao_basis_ao_vgl(context, &(ao_vgl[0][0][0]), (int64_t) 5*walk_num*elec_num*chbrclf_ao_num); assert (rc == QMCKL_SUCCESS); @@ -673,8 +673,8 @@ assert (rc == QMCKL_SUCCESS); assert(qmckl_mo_basis_provided(context)); -double mo_vgl[5][elec_num][chbrclf_mo_num]; -rc = qmckl_get_mo_basis_vgl(context, &(mo_vgl[0][0][0])); +double mo_vgl[walk_num*elec_num][5][chbrclf_mo_num]; +rc = qmckl_get_mo_basis_mo_vgl(context, &(mo_vgl[0][0][0]), 5*walk_num*elec_num*chbrclf_mo_num); assert (rc == QMCKL_SUCCESS); /* Set up determinant data */ diff --git a/org/qmckl_mo.org b/org/qmckl_mo.org index d4efb41..4f8bb57 100644 --- a/org/qmckl_mo.org +++ b/org/qmckl_mo.org @@ -84,13 +84,14 @@ int main() { The following arrays are stored in the context: - |---------------+--------------------+----------------------| - | ~mo_num~ | | Number of MOs | - | ~coefficient~ | ~[mo_num][ao_num]~ | Orbital coefficients | + |-----------------+--------------------+----------------------------------------| + | ~mo_num~ | | Number of MOs | + | ~coefficient~ | ~[mo_num][ao_num]~ | Orbital coefficients | + | ~coefficient_t~ | ~[ao_num][mo_num]~ | Transposed of the Orbital coefficients | + |-----------------+--------------------+----------------------------------------| Computed data: - |---------------+--------------------------+-------------------------------------------------------------------------------------| |---------------+--------------------------+-------------------------------------------------------------------------------------| | ~mo_vgl~ | ~[point_num][5][mo_num]~ | Value, gradients, Laplacian of the MOs at point positions | | ~mo_vgl_date~ | ~uint64_t~ | Late modification date of Value, gradients, Laplacian of the MOs at point positions | @@ -101,9 +102,10 @@ int main() { #+begin_src c :comments org :tangle (eval h_private_type) typedef struct qmckl_mo_basis_struct { int64_t mo_num; - double * coefficient; + double * restrict coefficient; + double * restrict coefficient_t; - double * mo_vgl; + double * restrict mo_vgl; uint64_t mo_vgl_date; int32_t uninitialized; @@ -355,6 +357,34 @@ qmckl_exit_code qmckl_finalize_mo_basis(qmckl_context context) { qmckl_context_struct* const ctx = (qmckl_context_struct* const) context; assert (ctx != NULL); + qmckl_memory_info_struct mem_info = qmckl_memory_info_struct_zero; + mem_info.size = ctx->ao_basis.ao_num * ctx->mo_basis.mo_num * sizeof(double); + double* new_array = (double*) qmckl_malloc(context, mem_info); + if (new_array == NULL) { + return qmckl_failwith( context, + QMCKL_ALLOCATION_FAILED, + "qmckl_finalize_mo_basis", + NULL); + } + + assert (ctx->mo_basis.coefficient != NULL); + + if (ctx->mo_basis.coefficient_t != NULL) { + qmckl_exit_code rc = qmckl_free(context, ctx->mo_basis.coefficient); + if (rc != QMCKL_SUCCESS) { + return qmckl_failwith( context, rc, + "qmckl_finalize_mo_basis", + NULL); + } + } + + for (int64_t i=0 ; iao_basis.ao_num ; ++i) { + for (int64_t j=0 ; jmo_basis.mo_num ; ++j) { + new_array[i*ctx->mo_basis.mo_num + j] = ctx->mo_basis.coefficient[j*ctx->ao_basis.ao_num + i]; + } + } + + ctx->mo_basis.coefficient_t = new_array; qmckl_exit_code rc = QMCKL_SUCCESS; return rc; } @@ -367,11 +397,18 @@ qmckl_exit_code qmckl_finalize_mo_basis(qmckl_context context) { *** Get #+begin_src c :comments org :tangle (eval h_func) :noweb yes -qmckl_exit_code qmckl_get_mo_basis_vgl(qmckl_context context, double* const mo_vgl); +qmckl_exit_code +qmckl_get_mo_basis_mo_vgl(qmckl_context context, + double* const mo_vgl, + const int64_t size_max); #+end_src #+begin_src c :comments org :tangle (eval c) :noweb yes :exports none -qmckl_exit_code qmckl_get_mo_basis_vgl(qmckl_context context, double* const mo_vgl) { +qmckl_exit_code +qmckl_get_mo_basis_mo_vgl(qmckl_context context, + double* const mo_vgl, + const int64_t size_max) +{ if (qmckl_context_check(context) == QMCKL_NULL_CONTEXT) { return QMCKL_NULL_CONTEXT; @@ -388,7 +425,13 @@ qmckl_exit_code qmckl_get_mo_basis_vgl(qmckl_context context, double* const mo_v qmckl_context_struct* const ctx = (qmckl_context_struct* const) context; assert (ctx != NULL); - size_t sze = 5 * ctx->point.num * ctx->mo_basis.mo_num; + const int64_t sze = ctx->point.num * 5 * ctx->mo_basis.mo_num; + if (size_max < sze) { + return qmckl_failwith( context, + QMCKL_INVALID_ARG_3, + "qmckl_get_mo_basis_mo_vgl", + "input array too small"); + } memcpy(mo_vgl, ctx->mo_basis.mo_vgl, sze * sizeof(double)); return QMCKL_SUCCESS; @@ -396,17 +439,84 @@ qmckl_exit_code qmckl_get_mo_basis_vgl(qmckl_context context, double* const mo_v #+end_src #+begin_src f90 :tangle (eval fh_func) :comments org :exports none - interface - integer(c_int32_t) function qmckl_get_mo_basis_vgl (context, mo_vgl) & - bind(C) - use, intrinsic :: iso_c_binding - import - implicit none + interface + integer(c_int32_t) function qmckl_get_mo_basis_mo_vgl (context, & + mo_vgl, size_max) bind(C) + use, intrinsic :: iso_c_binding + import + implicit none + + integer (c_int64_t) , intent(in) , value :: context + double precision, intent(out) :: mo_vgl(*) + integer (c_int64_t) , intent(in) , value :: size_max + end function qmckl_get_mo_basis_mo_vgl + end interface + #+end_src - integer (c_int64_t) , intent(in) , value :: context - double precision, intent(out) :: mo_vgl(*) - end function - end interface + Uses the given array to compute the VGL. + + #+begin_src c :comments org :tangle (eval h_func) :noweb yes +qmckl_exit_code +qmckl_get_mo_basis_mo_vgl_inplace (qmckl_context context, + double* const mo_vgl, + const int64_t size_max); + #+end_src + + #+begin_src c :comments org :tangle (eval c) :noweb yes :exports none +qmckl_exit_code +qmckl_get_mo_basis_mo_vgl_inplace (qmckl_context context, + double* const mo_vgl, + const int64_t size_max) +{ + + if (qmckl_context_check(context) == QMCKL_NULL_CONTEXT) { + return qmckl_failwith( context, + QMCKL_INVALID_CONTEXT, + "qmckl_get_mo_basis_mo_vgl", + NULL); + } + + qmckl_exit_code rc; + + qmckl_context_struct* const ctx = (qmckl_context_struct* const) context; + assert (ctx != NULL); + + const int64_t sze = ctx->mo_basis.mo_num * 5 * ctx->point.num; + if (size_max < sze) { + return qmckl_failwith( context, + QMCKL_INVALID_ARG_3, + "qmckl_get_mo_basis_mo_vgl", + "input array too small"); + } + + rc = qmckl_context_touch(context); + if (rc != QMCKL_SUCCESS) return rc; + + double* old_array = ctx->mo_basis.mo_vgl; + + ctx->mo_basis.mo_vgl = mo_vgl; + + rc = qmckl_provide_mo_vgl(context); + if (rc != QMCKL_SUCCESS) return rc; + + ctx->mo_basis.mo_vgl = old_array; + + return QMCKL_SUCCESS; +} + #+end_src + + #+begin_src f90 :tangle (eval fh_func) :comments org :exports none + interface + integer(c_int32_t) function qmckl_get_mo_basis_mo_vgl_inplace (context, & + mo_vgl, size_max) bind(C) + use, intrinsic :: iso_c_binding + import + implicit none + integer (c_int64_t) , intent(in) , value :: context + double precision, intent(out) :: mo_vgl(*) + integer (c_int64_t) , intent(in) , value :: size_max + end function qmckl_get_mo_basis_mo_vgl_inplace + end interface #+end_src *** Provide @@ -462,19 +572,19 @@ qmckl_exit_code qmckl_provide_mo_vgl(qmckl_context context) if (mo_vgl == NULL) { return qmckl_failwith( context, QMCKL_ALLOCATION_FAILED, - "qmckl_mo_basis_vgl", + "qmckl_mo_basis_mo_vgl", NULL); } ctx->mo_basis.mo_vgl = mo_vgl; } - rc = qmckl_compute_mo_basis_vgl(context, - ctx->ao_basis.ao_num, - ctx->mo_basis.mo_num, - ctx->point.num, - ctx->mo_basis.coefficient, - ctx->ao_basis.ao_vgl, - ctx->mo_basis.mo_vgl); + rc = qmckl_compute_mo_basis_mo_vgl(context, + ctx->ao_basis.ao_num, + ctx->mo_basis.mo_num, + ctx->point.num, + ctx->mo_basis.coefficient_t, + ctx->ao_basis.ao_vgl, + ctx->mo_basis.mo_vgl); if (rc != QMCKL_SUCCESS) { return rc; } @@ -488,25 +598,33 @@ qmckl_exit_code qmckl_provide_mo_vgl(qmckl_context context) *** Compute :PROPERTIES: - :Name: qmckl_compute_mo_basis_vgl + :Name: qmckl_compute_mo_basis_mo_vgl :CRetType: qmckl_exit_code :FRetType: qmckl_exit_code :END: - #+NAME: qmckl_mo_basis_vgl_args - | ~qmckl_context~ | ~context~ | in | Global state | - | ~int64_t~ | ~ao_num~ | in | Number of AOs | - | ~int64_t~ | ~mo_num~ | in | Number of MOs | - | ~int64_t~ | ~point_num~ | in | Number of points | - | ~double~ | ~coef_normalized[mo_num][ao_num]~ | in | AO to MO transformation matrix | - | ~double~ | ~ao_vgl[point_num][5][ao_num]~ | in | Value, gradients and Laplacian of the AOs | - | ~double~ | ~mo_vgl[point_num][5][mo_num]~ | out | Value, gradients and Laplacian of the MOs | + #+NAME: qmckl_mo_basis_mo_vgl_args + | Variable | Type | In/Out | Description | + |---------------------+--------------------------------+--------+-------------------------------------------------| + | ~context~ | ~qmckl_context~ | in | Global state | + | ~ao_num~ | ~int64_t~ | in | Number of AOs | + | ~mo_num~ | ~int64_t~ | in | Number of MOs | + | ~point_num~ | ~int64_t~ | in | Number of points | + | ~coef_normalized_t~ | ~double[mo_num][ao_num]~ | in | Transpose of the AO to MO transformation matrix | + | ~ao_vgl~ | ~double[point_num][5][ao_num]~ | in | Value, gradients and Laplacian of the AOs | + | ~mo_vgl~ | ~double[point_num][5][mo_num]~ | out | Value, gradients and Laplacian of the MOs | + The matrix of AO values is very sparse, so we use a sparse-dense + matrix multiplication instead of a dgemm, as exposed in + https://dx.doi.org/10.1007/978-3-642-38718-0_14. + + + #+begin_src f90 :comments org :tangle (eval f) :noweb yes -integer function qmckl_compute_mo_basis_vgl_f(context, & +integer function qmckl_compute_mo_basis_mo_vgl_doc_f(context, & ao_num, mo_num, point_num, & - coef_normalized, ao_vgl, mo_vgl) & + coef_normalized_t, ao_vgl, mo_vgl) & result(info) use qmckl implicit none @@ -514,55 +632,69 @@ integer function qmckl_compute_mo_basis_vgl_f(context, & integer*8 , intent(in) :: ao_num, mo_num integer*8 , intent(in) :: point_num double precision , intent(in) :: ao_vgl(ao_num,5,point_num) - double precision , intent(in) :: coef_normalized(ao_num,mo_num) + double precision , intent(in) :: coef_normalized_t(mo_num,ao_num) double precision , intent(out) :: mo_vgl(mo_num,5,point_num) - character :: TransA, TransB - double precision :: alpha, beta - integer*8 :: M, N, K, LDA, LDB, LDC, i,j + integer*8 :: i,j,k + double precision :: c1, c2, c3, c4, c5 - TransA = 'T' - TransB = 'N' - M = mo_num - N = point_num*5_8 - K = int(ao_num,8) - alpha = 1.d0 - beta = 0.d0 - LDA = size(coef_normalized,1) - LDB = size(ao_vgl,1) - LDC = size(mo_vgl,1) + do j=1,point_num + mo_vgl(:,:,j) = 0.d0 + do k=1,ao_num + if (ao_vgl(k,1,j) /= 0.d0) then + c1 = ao_vgl(k,1,j) + c2 = ao_vgl(k,2,j) + c3 = ao_vgl(k,3,j) + c4 = ao_vgl(k,4,j) + c5 = ao_vgl(k,5,j) + do i=1,mo_num + mo_vgl(i,1,j) = mo_vgl(i,1,j) + coef_normalized_t(i,k) * c1 + mo_vgl(i,2,j) = mo_vgl(i,2,j) + coef_normalized_t(i,k) * c2 + mo_vgl(i,3,j) = mo_vgl(i,3,j) + coef_normalized_t(i,k) * c3 + mo_vgl(i,4,j) = mo_vgl(i,4,j) + coef_normalized_t(i,k) * c4 + mo_vgl(i,5,j) = mo_vgl(i,5,j) + coef_normalized_t(i,k) * c5 + end do + end if + end do + end do - info = qmckl_dgemm(context,TransA, TransB, M, N, K, alpha, & - coef_normalized, int(size(coef_normalized,1),8), & - ao_vgl, int(size(ao_vgl,1),8), beta, & - mo_vgl,LDC) - - info = QMCKL_SUCCESS - -end function qmckl_compute_mo_basis_vgl_f +end function qmckl_compute_mo_basis_mo_vgl_doc_f #+end_src - #+CALL: generate_c_header(table=qmckl_mo_basis_vgl_args,rettyp=get_value("CRetType"),fname="qmckl_compute_mo_basis_vgl")) + #+CALL: generate_c_header(table=qmckl_mo_basis_mo_vgl_args,rettyp=get_value("CRetType"),fname="qmckl_compute_mo_basis_mo_vgl")) #+RESULTS: #+begin_src c :tangle (eval h_func) :comments org - qmckl_exit_code qmckl_compute_mo_basis_vgl ( - const qmckl_context context, - const int64_t ao_num, - const int64_t mo_num, - const int64_t point_num, - const double* coef_normalized, - const double* ao_vgl, - double* const mo_vgl ); + qmckl_exit_code qmckl_compute_mo_basis_mo_vgl ( + const qmckl_context context, + const int64_t ao_num, + const int64_t mo_num, + const int64_t point_num, + const double* coef_normalized_t, + const double* ao_vgl, + double* const mo_vgl ); #+end_src + #+CALL: generate_c_header(table=qmckl_mo_basis_mo_vgl_args,rettyp=get_value("CRetType"),fname="qmckl_compute_mo_basis_mo_vgl_doc")) - #+CALL: generate_c_interface(table=qmckl_mo_basis_vgl_args,rettyp=get_value("CRetType"),fname="qmckl_compute_mo_basis_vgl")) + #+RESULTS: + #+begin_src c :tangle (eval h_func) :comments org + qmckl_exit_code qmckl_compute_mo_basis_mo_vgl_doc ( + const qmckl_context context, + const int64_t ao_num, + const int64_t mo_num, + const int64_t point_num, + const double* coef_normalized_t, + const double* ao_vgl, + double* const mo_vgl ); + #+end_src + #+CALL: generate_c_interface(table=qmckl_mo_basis_mo_vgl_args,rettyp=get_value("CRetType"),fname="qmckl_compute_mo_basis_mo_vgl_doc")) + #+RESULTS: #+begin_src f90 :tangle (eval f) :comments org :exports none - integer(c_int32_t) function qmckl_compute_mo_basis_vgl & - (context, ao_num, mo_num, point_num, coef_normalized, ao_vgl, mo_vgl) & - bind(C) result(info) + integer(c_int32_t) function qmckl_compute_mo_basis_mo_vgl_doc & + (context, ao_num, mo_num, point_num, coef_normalized_t, ao_vgl, mo_vgl) & + bind(C) result(info) use, intrinsic :: iso_c_binding implicit none @@ -571,15 +703,176 @@ end function qmckl_compute_mo_basis_vgl_f integer (c_int64_t) , intent(in) , value :: ao_num integer (c_int64_t) , intent(in) , value :: mo_num integer (c_int64_t) , intent(in) , value :: point_num - real (c_double ) , intent(in) :: coef_normalized(ao_num,mo_num) + real (c_double ) , intent(in) :: coef_normalized_t(ao_num,mo_num) real (c_double ) , intent(in) :: ao_vgl(ao_num,5,point_num) real (c_double ) , intent(out) :: mo_vgl(mo_num,5,point_num) - integer(c_int32_t), external :: qmckl_compute_mo_basis_vgl_f - info = qmckl_compute_mo_basis_vgl_f & - (context, ao_num, mo_num, point_num, coef_normalized, ao_vgl, mo_vgl) + integer(c_int32_t), external :: qmckl_compute_mo_basis_mo_vgl_doc_f + info = qmckl_compute_mo_basis_mo_vgl_doc_f & + (context, ao_num, mo_num, point_num, coef_normalized_t, ao_vgl, mo_vgl) - end function qmckl_compute_mo_basis_vgl + end function qmckl_compute_mo_basis_mo_vgl_doc + #+end_src + + #+begin_src c :tangle (eval c) :comments org +qmckl_exit_code +qmckl_compute_mo_basis_mo_vgl (const qmckl_context context, + const int64_t ao_num, + const int64_t mo_num, + const int64_t point_num, + const double* coef_normalized_t, + const double* ao_vgl, + double* const mo_vgl ) +{ +#ifdef HAVE_HPC + return qmckl_compute_mo_basis_mo_vgl_hpc (context, ao_num, mo_num, point_num, coef_normalized_t, ao_vgl, mo_vgl); +#else + return qmckl_compute_mo_basis_mo_vgl_doc (context, ao_num, mo_num, point_num, coef_normalized_t, ao_vgl, mo_vgl); +#endif +} + #+end_src + + +*** HPC version + + + #+begin_src c :tangle (eval h_func) :comments org +#ifdef HAVE_HPC +qmckl_exit_code +qmckl_compute_mo_basis_mo_vgl_hpc (const qmckl_context context, + const int64_t ao_num, + const int64_t mo_num, + const int64_t point_num, + const double* coef_normalized_t, + const double* ao_vgl, + double* const mo_vgl ); +#endif + #+end_src + + #+begin_src c :tangle (eval c) :comments org +#ifdef HAVE_HPC +qmckl_exit_code +qmckl_compute_mo_basis_mo_vgl_hpc (const qmckl_context context, + const int64_t ao_num, + const int64_t mo_num, + const int64_t point_num, + const double* restrict coef_normalized_t, + const double* restrict ao_vgl, + double* restrict const mo_vgl ) +{ +#ifdef HAVE_OPENMP + #pragma omp parallel for +#endif + for (int64_t ipoint=0 ; ipoint < point_num ; ++ipoint) { + double* restrict const vgl1 = &(mo_vgl[ipoint*5*mo_num]); + const double* restrict avgl1 = &(ao_vgl[ipoint*5*ao_num]); + double* restrict const vgl2 = vgl1 + mo_num; + double* restrict const vgl3 = vgl1 + (mo_num << 1); + double* restrict const vgl4 = vgl1 + (mo_num << 1) + mo_num; + double* restrict const vgl5 = vgl1 + (mo_num << 2); + const double* restrict avgl2 = avgl1 + ao_num; + const double* restrict avgl3 = avgl1 + (ao_num << 1); + const double* restrict avgl4 = avgl1 + (ao_num << 1) + ao_num; + const double* restrict avgl5 = avgl1 + (ao_num << 2); + + for (int64_t i=0 ; i -#+STARTUP: align fold nodlcheck hidestars oddeven lognotestate latexpreview +# STARTUP: align fold nodlcheck hidestars oddeven lognotestate latexpreview +#+STARTUP: showeverything + #+AUTHOR: TREX CoE #+LANGUAGE: en